yangql / composable_kernel-1 · Commits

Commit 8a4b5978, authored May 22, 2019 by Chao Liu
Parent: 2a48812e

    adding implicit gemm v3

Changes: 26. Showing 20 changed files with 270 additions and 197 deletions (+270 -197).
Changed files:

  driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp   +4   -3
  driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp   +3   -2
  driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp   +4   -4
  driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp   +25  -29
  driver/driver.hip.cpp                                           +5   -5
  src/include/Array.hip.hpp                                       +39  -4
  src/include/ConstantMatrixDescriptor.hip.hpp                    +9   -9
  src/include/ConstantMergedTensorDescriptor.hip.hpp              +23  -20
  src/include/ConstantTensorDescriptor.hip.hpp                    +31  -16
  src/include/Sequence.hip.hpp                                    +7   -7
  src/include/blockwise_2d_tensor_op.hip.hpp                      +9   -5
  src/include/blockwise_4d_tensor_op.hip.hpp                      +8   -7
  src/include/blockwise_gemm.hip.hpp                              +2   -2
  src/include/blockwise_merged_tensor_slice_op.hip.hpp            +74  -63
  src/include/blockwise_tensor_slice_op.hip.hpp                   +4   -4
  src/include/common.hip.hpp                                      +9   -3
  src/include/conv_common.hip.hpp                                 +2   -2
  src/include/functional.hip.hpp                                  +1   -1
  src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp  +4  -4
  src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp  +7  -7
driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp

@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);
 
     // reorder weight
-    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
                        std::thread::hardware_concurrency());
 
     // reorder input
-    auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
+    auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Hi, Wi, N>{});
     ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");
     Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
@@ -64,7 +64,8 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
                        std::thread::hardware_concurrency());
 
     // output
-    auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc =
+        make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
     ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
     Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp

@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);
 
     // reorder weight
-    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -50,7 +50,8 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
                        std::thread::hardware_concurrency());
 
     // output
-    auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc =
+        make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
     ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
     Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp

@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);
 
     // reorder weight
-    auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
-#if 0
+#if 1
     // for 3x3, 34x34, v1r3, Pascal
     constexpr index_t BlockSize = 128;
@@ -92,7 +92,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t WeiBlockCopyDataPerRead_K   = 4;
     constexpr index_t OutThreadCopyDataPerWrite_W = 2;
-#elif 0
+#elif 1
     // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 32
     constexpr index_t BlockSize = 256;
@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     constexpr index_t WeiBlockCopyDataPerRead_K   = 4;
     constexpr index_t OutThreadCopyDataPerWrite_W = 2;
-#elif 1
+#elif 0
     // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
     constexpr index_t BlockSize = 256;
driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp

@@ -35,7 +35,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
     constexpr index_t X = wei_kcyx_desc.GetLength(I3);
 
     // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
     Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -56,37 +56,40 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
     wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
 
+    constexpr index_t N1 = 2;
+    constexpr index_t N2 = 4;
+    constexpr index_t B  = (N * Ho * Wo) / (N1 * N2);
+
 #if 1
-    // for 3x3, 28x28, v3, Pascal
-    constexpr index_t BlockSize = 128;
+    // for 3x3, 28x28, v3
+    constexpr index_t BlockSize = 256;
 
     constexpr index_t BPerBlock = 16;
     constexpr index_t KPerBlock = 128;
     constexpr index_t CPerBlock = 8;
 
+    constexpr index_t BPerThread = 1;
+    constexpr index_t KPerThread = 8;
+
     constexpr index_t GemmMPerThreadSubC = 4;
     constexpr index_t GemmNPerThreadSubC = 4;
     constexpr index_t GemmMLevel0Cluster = 4;
-    constexpr index_t GemmNLevel0Cluster = 2;
+    constexpr index_t GemmNLevel0Cluster = 4;
     constexpr index_t GemmMLevel1Cluster = 4;
-    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 4;
     constexpr index_t GemmKPerThreadLoop = 1;
     constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;
 
-    using InBlockReorderSrcSubLengths_NCHW     = Sequence<4, 1, 1, 1>;
-    using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
-    using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
+    using InBlockCopySubLengths_N1_N2_C_B     = Sequence<1, 4, 1, 1>;
+    using InBlockCopyClusterLengths_N1_N2_C_B = Sequence<2, 1, 8, 16>;
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
 
-    constexpr index_t WeiBlockCopyDataPerRead_K = 4;
+    constexpr index_t WeiBlockCopyDataPerAccess_K = 4;
 #endif
 
     constexpr index_t GridSize =
-        ((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
-        ((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
+        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
 
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
@@ -102,15 +105,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
         decltype(in_nchw_desc),
         decltype(wei_cyxk_desc),
         decltype(out_nkhw_desc),
-        NPerBlock,
+        BPerBlock,
         KPerBlock,
         CPerBlock,
-        HoPerBlock,
-        WoPerBlock,
-        NPerThread,
-        KPerThread,
-        HoPerThread,
-        WoPerThread,
+        N1,
+        N2,
         GemmMPerThreadSubC,
         GemmNPerThreadSubC,
         GemmMLevel0Cluster,
@@ -120,14 +119,11 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
         GemmKPerThreadLoop,
         GemmDataPerReadA,
         GemmDataPerReadB,
-        InBlockReorderSrcSubLengths_NCHW,
-        InBlockReorderSrcClusterLengths_NCHW,
-        InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
-        InBlockReorderDataPerRead_W,
-        InBlockReorderDataPerWrite_N,
-        WeiBlockCopyClusterLengths,
-        WeiBlockCopyDataPerRead_K,
-        OutThreadCopyDataPerWrite_W>{};
+        InBlockCopySubLengths_N1_N2_C_B,
+        InBlockCopyClusterLengths_N1_N2_C_B,
+        InBlockCopySrcDataPerRead_B,
+        InBlockCopyDstDataPerWrite_N2,
+        WeiBlockCopyDataPerAccess_K>{};
 
     float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
                                dim3(GridSize),
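Note: the heart of the v3 change is visible in this hunk. The (N, Ho, Wo) output dimensions are merged into a single GEMM dimension B, with the batch pre-split into N1 * N2 sub-batches, so the grid is tiled over B x K instead of N x K x Ho x Wo. Below is a standalone C++ sketch of the new grid-size arithmetic; the problem sizes N, K, Ho, Wo are hypothetical sample values (the real driver takes them from the tensor descriptors at compile time), while N1, N2, BPerBlock, KPerBlock are the values in this diff.

    #include <cstdio>

    int main()
    {
        // hypothetical sizes for the "3x3, 28x28" case named in the diff
        constexpr int N = 64, K = 128, Ho = 28, Wo = 28;

        constexpr int N1 = 2, N2 = 4;                  // sub-batch split added here
        constexpr int BPerBlock = 16, KPerBlock = 128; // block tile, as in the diff

        // v3 merges (N, Ho, Wo) into one GEMM dimension B
        constexpr int B = (N * Ho * Wo) / (N1 * N2);

        // one workgroup per BPerBlock x KPerBlock output tile
        constexpr int GridSize =
            ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

        std::printf("B = %d, GridSize = %d\n", B, GridSize); // B = 6272, GridSize = 392
    }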
driver/driver.hip.cpp

@@ -13,7 +13,7 @@
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
-//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
 
 struct GeneratorTensor_1
 {
@@ -548,8 +548,8 @@ int main(int argc, char* argv[])
     auto lower_pads = Sequence<HPad, WPad>{};
     auto upper_pads = Sequence<HPad, WPad>{};
 
-    auto in_nchw_desc  = make_packed_ConstantTensorDescriptor(Sequence<N, C, HI, WI>{});
-    auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence<K, C, Y, X>{});
+    auto in_nchw_desc  = make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, C, HI, WI>{});
+    auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, C, Y, X>{});
     auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
         in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);
@@ -612,11 +612,11 @@ int main(int argc, char* argv[])
     device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
 #elif 0
     device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0
     device_convolution_implicit_gemm_v2_chwn_cyxk_khwn
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
 #endif
         (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
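Note: the driver selects exactly one device_convolution_* entry point at compile time via this #if/#elif chain; the change above just moves the single enabled branch from v1_nchw_cyxk_nkhw to v3. A trivial standalone sketch of the pattern (run_v1/run_v3 are placeholder names, not functions from this repo):

    #include <cstdio>

    static void run_v1() { std::printf("v1\n"); }
    static void run_v3() { std::printf("v3\n"); }

    // exactly one branch is compiled in; flipping the 1 switches algorithms
    #if 0
    #define RUN_CONV run_v1
    #elif 1
    #define RUN_CONV run_v3
    #endif

    int main() { RUN_CONV(); }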
src/include/Array.hip.hpp

@@ -12,7 +12,7 @@ struct Array
     index_t mData[nSize];
 
     template <class... Xs>
-    __host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
+    __host__ __device__ constexpr Array(Xs... xs) : mData{static_cast<TData>(xs)...}
     {
     }
@@ -37,6 +37,25 @@ struct Array
     }
 };
 
+template <index_t... Is>
+__host__ __device__ constexpr auto sequence2array(Sequence<Is...>)
+{
+    return Array<index_t, sizeof...(Is)>{Is...};
+}
+
+template <class TData, index_t NSize>
+__host__ __device__ constexpr auto make_zero_array()
+{
+    Array<TData, NSize> a;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        a[i] = static_cast<TData>(0);
+    });
+
+    return a;
+}
+
 template <class TData, index_t NSize, index_t... IRs>
 __host__ __device__ auto reorder_array_given_new2old(const Array<TData, NSize>& old_array,
                                                      Sequence<IRs...> new2old)
@@ -80,15 +99,14 @@ __host__ __device__ auto extract_array(const Array<TData, NSize>& old_array, Ext
     static_for<0, new_size, 1>{}([&](auto I) {
         constexpr index_t i = I.Get();
-        new_array[i] = old_array[ExtractSeq{}.Get(I)];
+        new_array[i] = old_array[ExtractSeq::Get(I)];
     });
 
     return new_array;
 }
 
 template <class TData, index_t NSize>
-__host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
-                                             Array<TData, NSize> b)
+__host__ __device__ constexpr auto operator+(Array<TData, NSize> a,
+                                             const Array<TData, NSize>& b)
 {
     Array<TData, NSize> result;
@@ -99,3 +117,20 @@ __host__ __device__ constexpr auto operator+(const Array<TData, NSize>& a,
     return result;
 }
+
+// Array = Array * Sequence
+template <class TData, index_t NSize, index_t... Is>
+__host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is...> b)
+{
+    static_assert(sizeof...(Is) == NSize, "wrong! size not the same");
+
+    Array<TData, NSize> result;
+
+    static_for<0, NSize, 1>{}([&](auto I) {
+        constexpr index_t i = I.Get();
+        result[i] = a[i] + b.Get(I);
+    });
+
+    return result;
+}
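Note: the new helpers sequence2array and make_zero_array bridge compile-time Sequences and runtime-indexable Arrays, and the new operator* lets the copy kernels scale a multi-index by per-dimension lengths (as committed, its body fills result[i] with a[i] + b.Get(I) despite the * name). A minimal standalone sketch of the sequence2array idea, with the __host__ __device__ qualifiers stripped and trivial stand-ins for the repo's Sequence/Array types:

    #include <cstdio>

    using index_t = int;

    template <index_t... Is>
    struct Sequence
    {
    };

    template <class TData, index_t NSize>
    struct Array
    {
        TData mData[NSize];
    };

    // materialize a compile-time Sequence as an Array whose elements
    // can be indexed at run time
    template <index_t... Is>
    constexpr auto sequence2array(Sequence<Is...>)
    {
        return Array<index_t, sizeof...(Is)>{{Is...}};
    }

    int main()
    {
        constexpr auto a = sequence2array(Sequence<2, 0, 3, 1>{});

        for(index_t i = 0; i < 4; ++i)
            std::printf("%d ", a.mData[i]); // prints: 2 0 3 1
        std::printf("\n");
    }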
src/include/ConstantMatrixDescriptor.hip.hpp

@@ -9,26 +9,26 @@ struct ConstantMatrixDescriptor
         static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!");
     }
 
-    __host__ __device__ constexpr index_t NRow() const { return NRow_; }
+    __host__ __device__ static constexpr index_t NRow() { return NRow_; }
 
-    __host__ __device__ constexpr index_t NCol() const { return NCol_; }
+    __host__ __device__ static constexpr index_t NCol() { return NCol_; }
 
-    __host__ __device__ constexpr index_t RowStride() const { return RowStride_; }
+    __host__ __device__ static constexpr index_t RowStride() { return RowStride_; }
 
-    __host__ __device__ constexpr auto GetLengths() const { return Sequence<NRow_, NCol_>{}; }
+    __host__ __device__ static constexpr auto GetLengths() { return Sequence<NRow_, NCol_>{}; }
 
-    __host__ __device__ constexpr index_t GetElementSize() const { return NRow_ * NCol_; }
+    __host__ __device__ static constexpr index_t GetElementSize() { return NRow_ * NCol_; }
 
-    __host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; }
+    __host__ __device__ static constexpr index_t GetElementSpace() { return NRow_ * RowStride_; }
 
-    __host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const
+    __host__ __device__ static index_t GetOffsetFromMultiIndex(index_t irow, index_t icol)
     {
         return irow * RowStride_ + icol;
     }
 
     template <index_t SubNRow, index_t SubNCol>
-    __host__ __device__ constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
-                                                               Number<SubNCol>) const
+    __host__ __device__ static constexpr auto MakeSubMatrixDescriptor(Number<SubNRow>,
+                                                                      Number<SubNCol>)
    {
         return ConstantMatrixDescriptor<SubNRow, SubNCol, RowStride_>{};
     }
src/include/ConstantMergedTensorDescriptor.hip.hpp

@@ -11,8 +11,8 @@ struct ConstantMergedTensorDescriptor
 {
     static constexpr auto mOriginalDimMergeSeqs = std::tuple<OriginalDimMergeSeqs...>{};
 
-    static constexpr index_t nDim         = std::tuple_size<mOriginalDimMergeSeqs>::value;
-    static constexpr index_t nOriginalDim = OriginalDesc::GetNumOfDimension();
+    static constexpr index_t nDim         = sizeof...(OriginalDimMergeSeqs);
+    static constexpr index_t nOriginalDim = OriginalTensorDesc::GetNumOfDimension();
 
     __host__ __device__ constexpr ConstantMergedTensorDescriptor()
     {
@@ -21,25 +21,28 @@ struct ConstantMergedTensorDescriptor
         // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most
         // OriginalTensorDesc::nDim number of dimensions
-        // TODO: check there is no duplication in OriginalDimMergeSeqs
         // TODO: check OriginalDimMergeSeqs contains all original dimensions
+        // TODO: check there is no duplication in OriginalDimMergeSeqs
     }
 
     __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; }
 
-    __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim }
+    __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim; }
 
     template <index_t IDim>
     __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number<IDim>)
     {
-        return (std::Get<IDIM>(mOriginalDimMergeSeqs).GetSize() > 1);
+        return (std::get<IDim>(mOriginalDimMergeSeqs).GetSize() > 1);
     }
 
     template <index_t IDim>
     __host__ __device__ static constexpr index_t GetLength(Number<IDim>)
     {
-        constexpr auto original_dims_partial = std::Get<IDim>(mOriginalDimMergeSeqs);
+        constexpr auto original_dims_partial = std::get<IDim>(mOriginalDimMergeSeqs);
 
         return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize();
     }
@@ -50,14 +53,14 @@ struct ConstantMergedTensorDescriptor
         static_assert(!ContainMultipleOriginalDimensions(Number<IDim>{}),
                       "wrong! stride of a merged dimension is undefined");
 
-        constexpr auto idim_original = std::Get<IDim>(mOriginalDimMergeSeqs).Front();
+        constexpr auto idim_original = std::get<IDim>(mOriginalDimMergeSeqs).Front();
 
         return OriginalTensorDesc::GetStride(Number<idim_original>{});
     }
 
     __host__ __device__ static constexpr auto GetLengths()
     {
-        return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs).GetElementSize()...>{};
+        return Sequence<OriginalTensorDesc::Extract(OriginalDimMergeSeqs{}).GetElementSize()...>{};
     }
 
     __host__ __device__ static constexpr index_t GetElementSize()
@@ -75,17 +78,16 @@ struct ConstantMergedTensorDescriptor
             constexpr auto original_dims_partial = std::get<idim>(mOriginalDimMergeSeqs);
 
             // get partial original-multi-id corresponding to this merged dimension
-            constexpr auto original_multi_id_partial =
+            const auto original_multi_id_partial =
                 OriginalTensorDesc::Extract(original_dims_partial)
                     .GetMultiIndexFrom1dIndex(multi_id[idim]);
 
-            for(index_t i = 0; i < original_dims_partial.GetSize(); ++i)
-            {
-                index_t idim_original = original_dims_partial[i];
-                original_multi_id[idim_original] = original_multi_id_partial[i];
-            }
+            // make sure compiler unroll this loop and propagate all the constants
+            static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) {
+                constexpr auto I = decltype(I_){};
+                constexpr index_t idim_original = original_dims_partial.Get(I);
+                original_multi_id[idim_original] = original_multi_id_partial[I.Get()];
+            });
         });
 
         return original_multi_id;
@@ -95,10 +97,10 @@ struct ConstantMergedTensorDescriptor
     {
         const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id);
 
-        return OriginalTensorDesc::GetOffsetFromMultiIndex(orginal_multi_id);
+        return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id);
     }
 
-    template <index_t... Is>
+    template <class... Is>
     __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is)
     {
         return GetOffsetFromMultiIndex(Array<index_t, nDim>{is...});
@@ -106,14 +108,15 @@ struct ConstantMergedTensorDescriptor
     __host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
     {
-        constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths());
+        constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths());
 
         return dummy_desc.GetMultiIndexFrom1dIndex(id);
     }
 };
 
 template <class OriginalTensorDesc, class... OriginalDimMergeSeqs>
-constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
+__host__ __device__ constexpr auto
+make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...)
 {
     return ConstantMergedTensorDescriptor<OriginalTensorDesc, OriginalDimMergeSeqs...>{};
 }
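Note: the hunk above swaps a runtime for-loop for a static_for, so the loop index becomes a constant expression and the compiler can fully unroll and constant-propagate. A minimal standalone static_for in that spirit (this sketch uses C++17 if constexpr; the repo's own static_for lives in functional.hip.hpp and is implemented differently):

    #include <cstdio>

    using index_t = int;

    template <index_t I>
    struct Number
    {
        static constexpr index_t Get() { return I; }
    };

    // calls f(Number<IBegin>{}), f(Number<IBegin+Inc>{}), ... so the
    // index is usable in constexpr contexts inside the lambda
    template <index_t IBegin, index_t IEnd, index_t Inc>
    struct static_for
    {
        template <class F>
        void operator()(F f) const
        {
            if constexpr(IBegin < IEnd)
            {
                f(Number<IBegin>{});
                static_for<IBegin + Inc, IEnd, Inc>{}(f);
            }
        }
    };

    int main()
    {
        static_for<0, 4, 1>{}([&](auto I) {
            constexpr index_t i = I.Get(); // legal: i is a constant expression
            std::printf("%d ", i);
        });
        std::printf("\n"); // prints: 0 1 2 3
    }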
src/include/ConstantTensorDescriptor.hip.hpp

@@ -2,20 +2,20 @@
 #include "common.hip.hpp"
 
 template <class Lengths>
-__host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths)
+__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths)
 {
     return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies<index_t>{})
         .PushBack(Number<1>{});
 }
 
 template <class Lengths, index_t Align>
 __host__ __device__ constexpr auto
-calculate_rank_tensor_default_strides_with_alignment(Lengths, Number<Align>)
+calculate_tensor_strides_default_rank_aligned(Lengths, Number<Align>)
 {
     constexpr index_t L_back_align =
         Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
 
-    return calculate_packed_tensor_strides(
+    return calculate_tensor_strides_default_rank_packed(
         Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
 }
@@ -66,6 +66,12 @@ struct ConstantTensorDescriptor
         return MemoryRanks{}.Get(Number<I>{});
     }
 
+    template <class T>
+    __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(T)
+    {
+        return false;
+    }
+
     __host__ __device__ static constexpr index_t GetElementSize()
     {
         return accumulate_on_sequence(Lengths{}, std::multiplies<index_t>{}, Number<1>{});
@@ -146,7 +152,7 @@ struct ConstantTensorDescriptor
     {
         Array<index_t, nDim> multi_id;
 
-        constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths());
+        constexpr auto dummy_strides = calculate_tensor_strides_default_rank_packed(GetLengths());
 
         // calculate index in each of the dimensions in the order of their dimension (not rank)
         static_for<0, nDim - 1, 1>{}([&](auto IDim) {
@@ -181,6 +187,12 @@ struct ConstantTensorDescriptor
         return ConstantTensorDescriptor<extract_lengths, extract_strides, new_ranks>{};
     }
 
+    template <index_t... IDims>
+    __host__ __device__ static constexpr auto Extract(Sequence<IDims...>)
+    {
+        return Extract(Number<IDims>{}...);
+    }
+
     template <index_t IDim, index_t SliceLen>
     __host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
     {
@@ -271,9 +283,11 @@ struct ConstantTensorDescriptor
                       FirstUnfoldDim <= LastUnfoldDim,
                       "wrong! should have FirstUnfoldDim <= LastUnfoldDim!");
 
+#if 0 // cannot compile: compiler complain about constexpr
         // dimensions to be unfold need to be in descending order (w.r.t. strides), and need to be
         // packed in memory, otherwise, unfolding is invalid
-        static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim) {
+        static_for<FirstUnfoldDim, LastUnfoldDim, 1>{}([&](auto IDim_) {
+            constexpr auto IDim = decltype(IDim_){};
             constexpr auto IDim_p1 = IDim + Number<1>{};
 
             // check stride
@@ -285,11 +299,12 @@ struct ConstantTensorDescriptor
            static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim),
                          "wrong! dimensions to be unfolded need to be packed");
 
-            // check tranks
+            // check ranks
            static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1,
                          "wrong! ranks of dimensions to be unfolded need to be in increasing and "
                          "continuous ranks");
        });
+#endif
 
         // left and right
         constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{};
@@ -308,9 +323,9 @@ struct ConstantTensorDescriptor
         // decrease the ranks that are larger than the rank of LastUnfoldDim
         constexpr auto tmp_ranks =
-            transform_sequences(GetMemoryRanks(),
-                                f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
-                                              LastUnfoldDim - FirstUnfoldDim + 1>{});
+            transform_sequences(f_unfold_impl<GetMemoryRank(Number<LastUnfoldDim>{}),
+                                              LastUnfoldDim - FirstUnfoldDim + 1>{},
+                                GetMemoryRanks());
 
         // new lengths, strides and ranks
         constexpr auto new_lengths = GetLengths()
@@ -354,26 +369,26 @@ struct ConstantTensorDescriptor
 };
 
 template <class Lengths>
-__host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths)
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_packed(Lengths)
 {
-    using Strides     = decltype(calculate_packed_tensor_strides(Lengths{}));
+    using Strides     = decltype(calculate_tensor_strides_default_rank_packed(Lengths{}));
     using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
     return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
 }
 
 template <class Lengths, class Strides>
-__host__ __device__ constexpr auto make_ranked_ConstantTensorDescriptor(Lengths, Strides)
+__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank(Lengths, Strides)
 {
     using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
     return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
 }
 
 template <class Lengths, index_t Align>
 __host__ __device__ constexpr auto
-make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number<Align>)
+make_ConstantTensorDescriptor_default_rank_aligned(Lengths, Number<Align>)
 {
     using Strides =
-        decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number<Align>{}));
+        decltype(calculate_tensor_strides_default_rank_aligned(Lengths{}, Number<Align>{}));
     using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType;
     return ConstantTensorDescriptor<Lengths, Strides, MemoryRanks>{};
 }
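Note: the packed-stride helper renamed above computes strides as a reverse inclusive scan of the lengths under multiplication, with stride 1 appended for the fastest-varying dimension. A standalone C++17 illustration of that rule (the tensor sizes are hypothetical sample values):

    #include <array>
    #include <cstdio>

    // same rule as calculate_tensor_strides_default_rank_packed:
    // stride[last] = 1, stride[i-1] = stride[i] * length[i]
    template <std::size_t N>
    constexpr std::array<int, N> packed_strides(const std::array<int, N>& len)
    {
        std::array<int, N> s{};
        s[N - 1] = 1;
        for(std::size_t i = N - 1; i > 0; --i)
            s[i - 1] = s[i] * len[i];
        return s;
    }

    int main()
    {
        // e.g. a CYXK weight tensor with C=8, Y=3, X=3, K=128
        constexpr auto s = packed_strides(std::array<int, 4>{{8, 3, 3, 128}});

        for(int v : s)
            std::printf("%d ", v); // prints: 1152 384 128 1
        std::printf("\n");
    }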
src/include/Sequence.hip.hpp

 #pragma once
 
-#include "constant_integral.hip.hpp"
+#include "integral_constant.hip.hpp"
 #include "functional.hip.hpp"
 
 template <index_t... Is>
@@ -21,12 +21,6 @@ struct Sequence
         return mData[I];
     }
 
-    __host__ __device__ index_t operator[](index_t i) const
-    {
-        const index_t mData[mSize + 1] = {Is..., 0};
-        return mData[i];
-    }
-
     template <index_t... IRs>
     __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence<IRs...> /*new2old*/)
     {
@@ -164,6 +158,12 @@ struct sequence_reverse_inclusive_scan<Sequence<I>, Reduce>
     using SeqType = Sequence<I>;
 };
 
+template <class Reduce>
+struct sequence_reverse_inclusive_scan<Sequence<>, Reduce>
+{
+    using SeqType = Sequence<>;
+};
+
 template <class, class>
 struct sequence_extract;
src/include/blockwise_2d_tensor_op.hip.hpp

@@ -457,7 +457,8 @@ struct Blockwise2dTensorCopy3
     index_t mSrcMyThreadOffset;
     index_t mDstMyThreadOffset;
 
-    __device__ Blockwise2dTensorCopy3()
+    __device__ Blockwise2dTensorCopy3(Array<index_t, 2> src_block_data_multi_id_begin,
+                                      Array<index_t, 2> dst_block_data_multi_id_begin)
     {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
@@ -499,10 +500,13 @@ struct Blockwise2dTensorCopy3
         const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1;
         const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1;
 
-        mSrcMyThreadOffset =
-            SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
-        mDstMyThreadOffset =
-            DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead);
+        mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(
+            src_block_data_multi_id_begin +
+            Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
+
+        mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(
+            dst_block_data_multi_id_begin +
+            Array<index_t, 2>{thread_id_d0, thread_id_d1 * DataPerRead});
     }
 
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
View file @
8a4b5978
...
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
...
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
desc
=
make_
packed_
ConstantTensorDescriptor
(
dst_desc
.
GetLengths
());
constexpr
auto
desc
=
make_ConstantTensorDescriptor
_default_rank_packed
(
dst_desc
.
GetLengths
());
#if 0
#if 0
if(get_thread_local_1d_id() == 0)
if(get_thread_local_1d_id() == 0)
...
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
...
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
ref_desc
=
make_
packed_
ConstantTensorDescriptor
(
SrcOpLengths
{});
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
_default_rank_packed
(
SrcOpLengths
{});
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
...
@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
...
@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
constexpr
index_t
read_per_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
DataPerRead
);
constexpr
index_t
read_per_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
DataPerRead
);
constexpr
auto
ref_desc
=
constexpr
auto
ref_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
L0
,
L1
,
L2
,
read_per_d3
>
{});
make_ConstantTensorDescriptor
_default_rank_packed
(
Sequence
<
L0
,
L1
,
L2
,
read_per_d3
>
{});
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
constexpr
index_t
NLoop
=
ref_desc
.
GetElementSize
()
/
BlockSize
;
...
@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded
...
@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
ref_desc
=
make_
packed_
ConstantTensorDescriptor
(
DstOpLengths
{});
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
_default_rank_packed
(
DstOpLengths
{});
constexpr
auto
h_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I0
);
constexpr
auto
h_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I0
);
constexpr
auto
w_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I1
);
constexpr
auto
w_global_pad_low
=
GlobalLowerPads
{}.
Get
(
I1
);
...
@@ -510,7 +510,8 @@ struct Blockwise4dTensorCopy3
...
@@ -510,7 +510,8 @@ struct Blockwise4dTensorCopy3
}
}
}
}
constexpr
auto
thread_cluster_desc
=
make_packed_ConstantTensorDescriptor
(
ThreadPerDims
{});
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_default_rank_packed
(
ThreadPerDims
{});
const
auto
thread_multi_id
=
const
auto
thread_multi_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
...
@@ -652,7 +653,7 @@ struct Blockwise4dTensorCopy3
...
@@ -652,7 +653,7 @@ struct Blockwise4dTensorCopy3
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
auto
clipboard_desc
=
make_
packed_
ConstantTensorDescriptor
(
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor
_default_rank_packed
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
#pragma unroll
#pragma unroll
...
@@ -719,7 +720,7 @@ struct Blockwise4dTensorCopy3
...
@@ -719,7 +720,7 @@ struct Blockwise4dTensorCopy3
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d2
=
L2
/
thread_per_d2
;
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
index_t
nloop_d3
=
mod_conv
::
integer_divide_ceil
(
L3
,
thread_per_d3
*
DataPerRead
);
constexpr
auto
clipboard_desc
=
make_
packed_
ConstantTensorDescriptor
(
constexpr
auto
clipboard_desc
=
make_ConstantTensorDescriptor
_default_rank_packed
(
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
Sequence
<
nloop_d0
,
nloop_d1
,
nloop_d2
,
nloop_d3
*
DataPerRead
>
{});
#pragma unroll
#pragma unroll
...
...
src/include/blockwise_gemm.hip.hpp

@@ -46,7 +46,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                           N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
                       "wrong! Cannot evenly divide work among \n");
 
-        static_assert(ThreadMatrixC::GetLengths() == GetThreadMatrixCLengths(),
+        static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
                       "wrong! ThreadMatrixC lengths is wrong");
 
         auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
@@ -55,7 +55,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
         mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col);
     }
 
-    __device__ static auto GetThreadMatrixCLengths()
+    __device__ static constexpr auto GetThreadMatrixCLengths()
     {
         constexpr index_t M = BlockMatrixA::NCol(); // A is transposed
         constexpr index_t N = BlockMatrixB::NCol();
src/include/blockwise_merged_tensor_slice_op.hip.hpp
View file @
8a4b5978
#pragma once
#pragma once
#include "threadwise_tensor_slice_op.hip.hpp"
#include "threadwise_tensor_slice_op.hip.hpp"
// slice a merged tensor, reorder and copy it into a normal tensor
// slice a (normal or merged) tensor, reorder and copy it into another (normal or merged) tensor
// src: a merged tensor,
// dst: a normal tensor
template
<
index_t
BlockSize
,
template
<
index_t
BlockSize
,
class
Float
,
class
Float
,
class
SrcDesc
,
class
SrcDesc
,
class
DstDesc
,
class
DstDesc
,
class
SliceLengths
,
class
SliceLengths
,
class
SubLengths
,
class
SubLengths
,
class
ClusterLengths
,
class
Data
ClusterLengths
,
class
ThreadClusterArrangeOrder
,
class
ThreadClusterArrangeOrder
,
class
SrcAccessOrder
,
class
SrcAccessOrder
,
class
DstAccessOrder
>
class
DstAccessOrder
,
index_t
SrcDataPerRead
,
index_t
DstDataPerRead
>
struct
BlockwiseTensorSliceCopy_generic_v1
struct
BlockwiseTensorSliceCopy_generic_v1
{
{
static
constexpr
index_t
nDim
=
SrcDesc
::
GetNumOfDimension
();
static
constexpr
index_t
nDim
=
SrcDesc
::
GetNumOfDimension
();
...
@@ -21,39 +21,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
...
@@ -21,39 +21,44 @@ struct BlockwiseTensorSliceCopy_generic_v1
index_t
mSrcMyThreadOffset
;
index_t
mSrcMyThreadOffset
;
index_t
mDstMyThreadOffset
;
index_t
mDstMyThreadOffset
;
__device__
BlockwiseTensorSliceCopy_generic_v1
(
Array
<
index_t
,
nDim
>
src_block_multi_offset
,
__device__
Array
<
index_t
,
nDim
>
dst_block_multi_offset
)
BlockwiseTensorSliceCopy_generic_v1
(
Array
<
index_t
,
nDim
>
src_block_data_multi_id_begin
,
Array
<
index_t
,
nDim
>
dst_block_data_multi_id_begin
)
{
{
// check NDim consistent
// check NDim consistent
static_assert
(
SrcDesc
::
GetNumOfDimension
()
==
DstDesc
::
GetNumOfDimension
(),
"wrong"
);
static_assert
(
SrcDesc
::
GetNumOfDimension
()
==
DstDesc
::
GetNumOfDimension
(),
"wrong"
);
constexpr
auto
thread_cluster_desc
=
make_packed_ConstantTensorDescriptor
(
// thread cluster
ClusterLengths
{}.
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_default_rank_packed
(
DataClusterLengths
{}.
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
// BlockSize
// BlockSize
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize"
);
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize"
);
// divide work
// divide work
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
auto
data_per_cluster_per_dims
=
SubLengths
{}
*
DataClusterLengths
{};
static_assert
(
SliceLengths
{}.
Get
(
IDim
)
%
SubLenghs
{}.
Get
(
IDim
)
==
0
,
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim_
)
{
constexpr
auto
IDim
=
decltype
(
IDim_
){};
static_assert
(
SliceLengths
::
Get
(
IDim
)
%
SubLengths
::
Get
(
IDim
)
==
0
,
"wrong! cannot evenly divide sliced tensor into sub-tensor"
);
"wrong! cannot evenly divide sliced tensor into sub-tensor"
);
static_assert
(
SliceLengths
::
Get
(
IDim
)
%
data_per_cluster_per_dims
.
Get
(
IDim
)
==
0
,
"wrong! cannot evenly divide sliced tensor into cluster"
);
});
});
constexpr
auto
thread_work_desc
=
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
data_per_cluster_per_dims
;
make_packed_ConstantTensorDescriptor
(
SliceLengths
{}
/
SliceSubLengths
{});
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
// for now, only support SubLengths.Get() == 1 on a merged dimension that is merge from
static_assert
(
thread_work_desc
.
GetLength
(
IDim
)
%
thread_cluster_desc
.
Get
(
IDim
)
==
0
,
// multiple dimensions
"wrong! cannot evenly divide work to cluster"
);
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim_
)
{
}
)
;
constexpr
auto
IDim
=
decltype
(
IDim_
){
};
// only support SubLengths.Get() == 1 on merged dimension, for now
static_assert
(
SubLengths
::
Get
(
IDim
)
==
1
||
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
(
!
SrcDesc
::
ContainMultipleOriginalDimensions
(
IDim
)
&&
static_if
<
(
SrcDesc
::
ContainMultipleOriginalDimensions
(
IDim
)
||
!
DstDesc
::
ContainMultipleOriginalDimensions
(
IDim
)),
DstDesc
::
ContainMultipleOriginalDimensions
(
IDim
))
>
{}([
&
](
auto
fwd
)
{
"wrong! only surpport Sub-Length == 1 on a merged dimension"
);
static_assert
(
fwd
(
SubLengths
{}).
Get
(
IDim
)
==
1
,
"wrong! Sub-Lengths on merged dimension should be 1"
);
});
});
});
// calculate mSrcMyThreadOffset, mDstMyThreadOffset
// calculate mSrcMyThreadOffset, mDstMyThreadOffset
...
@@ -63,22 +68,23 @@ struct BlockwiseTensorSliceCopy_generic_v1
...
@@ -63,22 +68,23 @@ struct BlockwiseTensorSliceCopy_generic_v1
const
auto
data_cluster_multi_id
=
const
auto
data_cluster_multi_id
=
reorder_array_given_old2new
(
thread_cluster_multi_id
,
ThreadClusterArrangeOrder
{});
reorder_array_given_old2new
(
thread_cluster_multi_id
,
ThreadClusterArrangeOrder
{});
const
auto
thread_data_multi_offset
=
data_cluster_multi_id
*
SubLengths
{};
const
auto
thread_data_multi_id_begin
=
data_cluster_multi_id
*
SubLengths
{};
mSrcMyThreadOffset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_block_data_multi_id_begin
+
thread_data_multi_id_begin
);
mSrcMythreadOffset
=
mSrcMyThreadOffset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_block_data_multi_id_begin
+
SrcDesc
::
GetOffsetFromMultiIndex
(
src_block_multi_offset
+
thread_data_multi_offset
);
thread_data_multi_id_begin
);
mSrcMythreadOffset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_block_multi_offset
+
thread_data_multi_offset
);
}
}
__device__
static
constexpr
index_t
GetRegisterClipboardSize
()
__device__
static
constexpr
index_t
GetRegisterClipboardSize
()
{
{
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ClusterLengths
{});
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
Data
ClusterLengths
{});
constexpr
auto
thread_tensor_desc
=
constexpr
auto
thread_tensor_desc
=
make_
packed_
ConstantTensorDescriptor
(
SubLengths
{}
*
repeat_lengths
);
make_ConstantTensorDescriptor
_default_rank_packed
(
SubLengths
{}
*
repeat_lengths
);
return
thread_tensor_desc
.
GetElementSpace
Size
();
return
thread_tensor_desc
.
GetElementSpace
();
}
}
__device__
void
RunLoadRegisterClipboard
(
const
Float
*
__restrict__
p_src
,
__device__
void
RunLoadRegisterClipboard
(
const
Float
*
__restrict__
p_src
,
...
@@ -86,32 +92,34 @@ struct BlockwiseTensorSliceCopy_generic_v1
...
@@ -86,32 +92,34 @@ struct BlockwiseTensorSliceCopy_generic_v1
{
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ClusterLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
Data
ClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ClusterLengths
{});
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
Data
ClusterLengths
{});
constexpr
auto
thread_tensor_desc
=
constexpr
auto
thread_tensor_desc
=
make_ConstantTensorDescriptor_default_rank_packed
(
make_packed_ConstantTensorDescriptor
(
thread_sub_tensor_lengths
*
repeat_lengths
);
thread_sub_tensor_lengths
*
repeat_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
constexpr
auto
repeat_multi_id
=
sequence2array
(
decltype
(
repeat_multi_id_
){});
const
auto
src_thread_data_multi_id_begin
=
repeat_multi_id
*
data_per_cluster_per_dims
;
// cannot not constexpr, why?
constexpr
auto
src_data_multi_offset
=
repeat_multi_id
*
data_per_cluster_per_dims
;
const
auto
clipboard_data_multi_id_begin
=
repeat_multi_id
*
thread_sub_tensor_lengths
;
// cannot not constexpr, why?
const
expr
auto
clipboard_data_multi_offset
=
const
index_t
src_offset
=
SrcDesc
{}.
GetOffsetFromMultiIndex
(
repeat_multi_id
*
thread_sub_tensor_lengths
;
src_thread_data_multi_id_begin
);
// cannot not constexpr, why?
constexpr
index_t
src_offset
=
SrcDesc
{}.
GetOffsetFromMultiIndex
(
src_data_multi_id
);
const
index_t
clipboard_offset
=
thread_tensor_desc
.
GetOffsetFromMultiIndex
(
constexpr
index_t
clipboard_offset
=
clipboard_data_multi_id_begin
);
// cannot not constexpr, why?
thread_tensor_desc
.
GetOffsetFromMultiIndex
(
clipboard_data_multi_id
);
threadwise_tensor_slice_copy_generic
(
SrcDesc
{},
threadwise_tensor_slice_copy_generic
(
SrcDesc
{},
p_src
+
src_offset
+
mSrcMyThreadOffset
,
p_src
+
src_offset
+
mSrcMyThreadOffset
,
thread_tensor_desc
,
make_zero_array
<
index_t
,
nDim
>
(),
zero_array
<
index_t
,
nDim
>
{},
thread_tensor_desc
,
thread_tensor_desc
,
p_clipboard
+
clipboard_offset
,
p_clipboard
+
clipboard_offset
,
zero_array
<
index_t
,
nDim
>
{}
,
make_
zero_array
<
index_t
,
nDim
>
()
,
thread_sub_tensor_lengths
,
thread_sub_tensor_lengths
,
SrcAccessOrder
{});
SrcAccessOrder
{});
});
});
...
@@ -122,34 +130,37 @@ struct BlockwiseTensorSliceCopy_generic_v1
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

-        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{};
+        constexpr auto data_per_cluster_per_dims =
+            thread_sub_tensor_lengths * DataClusterLengths{};

-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_sub_tensor_lengths *
+                                                              repeat_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
-            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
+            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

-            constexpr auto clipboard_data_multi_offset =
-                repeat_multi_id * thread_sub_tensor_lengths;
+            // cannot be constexpr, why?
+            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

-            constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims;
+            // cannot be constexpr, why?
+            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-            constexpr index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset);
+            // cannot be constexpr, why?
+            const index_t clipboard_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);

-            constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset);
+            // cannot be constexpr, why?
+            const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);

            threadwise_tensor_slice_copy_generic(thread_tensor_desc,
                                                 p_clipboard + clipboard_offset,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                 DstDesc{},
                                                 p_dst + dst_offset + mDstMyThreadOffset,
-                                                 zero_array<index_t, nDim>{},
+                                                 make_zero_array<index_t, nDim>(),
                                                 thread_sub_tensor_lengths,
                                                 DstAccessOrder{});
        });
    }

    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
...
@@ -159,4 +170,4 @@ struct BlockwiseTensorSliceCopy_generic_v1
        RunLoadRegisterClipboard(p_src, p_clipboard);
        RunStoreRegisterClipboard(p_clipboard, p_dst);
    }
};
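The Run above is the whole pipeline: each thread gathers its slice of the source into a private register "clipboard", then writes the clipboard to the destination. A hedged caller-side sketch of that two-phase pattern follows; the copy-object and buffer-sizing names are assumptions modeled on this file, not verbatim API:

// Hypothetical driver for a copy object shaped like
// BlockwiseTensorSliceCopy_generic_v1. The clipboard is a per-thread
// register array whose element count the copy type reports; in the code
// above it is derived from SubLengths * repeat_lengths.
template <class BlockCopy, class Float>
__device__ void copy_tile(const BlockCopy& block_copy,
                          const Float* __restrict__ p_src,
                          Float* __restrict__ p_dst)
{
    Float p_clipboard[BlockCopy::GetRegisterClipboardSize()]; // assumed constexpr size

    // Phase 1: source -> registers; phase 2: registers -> destination.
    // Keeping the phases separate is what lets the lds_double_buffer
    // kernels overlap the next tile's load with the current tile's math.
    block_copy.RunLoadRegisterClipboard(p_src, p_clipboard);
    block_copy.RunStoreRegisterClipboard(p_clipboard, p_dst);
}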
src/include/blockwise_tensor_slice_op.hip.hpp View file @ 8a4b5978
...
@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
            src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

        constexpr auto thread_cluster_desc =
-            make_packed_ConstantTensorDescriptor(thread_cluster_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths);

        // sanity check: data type
        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
...
@@ -149,7 +149,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);

        return thread_tensor_desc.GetElementSpace();
    }
...
@@ -170,7 +170,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
...
@@ -208,7 +208,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

        constexpr auto thread_tensor_desc =
-            make_packed_ConstantTensorDescriptor(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
...
src/include/common.hip.hpp View file @ 8a4b5978
 #pragma once

 #include "vector_type.hip.hpp"
-#include "constant_integral.hip.hpp"
+#include "integral_constant.hip.hpp"
 #include "Sequence.hip.hpp"
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"
...
@@ -17,15 +17,21 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
 template <class T1, class T2>
 struct is_same
 {
-    static const bool value = false;
+    static constexpr bool value = false;
 };

 template <class T>
 struct is_same<T, T>
 {
-    static const bool value = true;
+    static constexpr bool value = true;
 };

+template <class X, class Y>
+__host__ __device__ constexpr bool is_same_type(X, Y)
+{
+    return is_same<X, Y>::value;
+}
+
 namespace mod_conv { // namespace mod_conv

 template <class T, T s>
 struct scales
...
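The new is_same_type helper is a value-level convenience over the trait: the types are deduced from objects instead of being spelled as template arguments, which is handy when descriptor types are long decltype expressions. A small sketch, assuming the definitions above are in scope:

// Hypothetical tag types standing in for two descriptor types.
struct DescA {};
struct DescB {};

// Trait form: explicit template arguments.
static_assert(is_same<DescA, DescA>::value, "same type");
static_assert(!is_same<DescA, DescB>::value, "different types");

// Helper form: types deduced from temporary objects.
static_assert(is_same_type(DescA{}, DescA{}), "same type");
static_assert(!is_same_type(DescA{}, DescB{}), "different types");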
src/include/conv_common.hip.hpp View file @ 8a4b5978
...
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
     constexpr auto HO = HI + 1 - Y;
     constexpr auto WO = WI + 1 - X;

-    return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
+    return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
 }
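These are the usual output extents of a stride-1, unpadded ("valid") convolution: a Y-tall filter has HI - Y + 1 vertical placements in an HI-tall input, and likewise for width. A quick compile-time check with example numbers (34x34 input, 3x3 filter):

// Example: HI = WI = 34, Y = X = 3 gives a 32x32 output.
constexpr int HI = 34, WI = 34, Y = 3, X = 3;
static_assert(HI + 1 - Y == 32 && WI + 1 - X == 32, "valid-conv output is 32x32");

The padded variant below is the same formula with HPadLow + HPadUp (and WPadLow + WPadUp) added to the input extent first.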
 template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
...
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
     constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
     constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;

-    return make_packed_ConstantTensorDescriptor(Sequence<N, K, HO, WO>{});
+    return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
 }

 template <class InDesc, class WeiDesc, class OutDesc>
...
View file @
8a4b5978
#pragma once
#pragma once
#include "
constant_integral
.hip.hpp"
#include "
integral_constant
.hip.hpp"
struct
forwarder
struct
forwarder
{
{
...
...
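The body of forwarder is collapsed in this view. In code of this vintage it is typically a trivial perfect-forwarding functor used as a pass-through callable in template machinery; a sketch of that common pattern (an assumption, not necessarily this file's exact body):

#include <utility>

// Stateless functor whose operator() perfectly forwards its argument,
// preserving value category; useful as a default transform in templates.
struct forwarder_sketch
{
    template <class T>
    constexpr T&& operator()(T&& x) const
    {
        return std::forward<T>(x);
    }
};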
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp View file @ 8a4b5978
...
@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);

-        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});

        const auto block_work_multi_id =
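The *BlockWork values count how many blocks tile each dimension, so the division must round up when a dimension is not evenly divisible by its per-block size. Assuming the conventional definition of mod_conv::integer_divide_ceil (the helper itself is not shown in this hunk):

// Presumed shape of the helper: ceiling division for positive integers.
template <class T>
constexpr T integer_divide_ceil_sketch(T a, T b)
{
    return (a + b - 1) / b;
}

// e.g. tiling Wo = 70 output columns with WoPerBlock = 32 takes 3 blocks,
// the last one only partially filled:
static_assert(integer_divide_ceil_sketch(70, 32) == 3, "ceil(70 / 32) == 3");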
...
@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
            GemmDataPerReadA,
            GemmDataPerReadB);

-        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockCopyDataPerRead_N>{});
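The _aligned maker differs from the packed one by padding the row pitch so every row of the LDS tile starts on a multiple of the requested alignment, keeping the Number<InBlockCopyDataPerRead_N>-wide vector accesses off misaligned addresses. A rough 2-d illustration of the idea; the real maker is variadic and its exact rounding rule is not shown in this hunk, so treat this as an assumption:

// Hypothetical 2-d "aligned" stride computation: packed strides, except the
// innermost extent is rounded up to the alignment before deriving the outer stride.
constexpr int round_up(int x, int align) { return align * ((x + align - 1) / align); }

constexpr int W_len     = 30;                         // innermost length
constexpr int alignment = 4;                          // e.g. 4-wide vector loads
constexpr int stride_w  = 1;                          // innermost stride stays 1
constexpr int stride_h  = round_up(W_len, alignment); // outer rows start aligned

static_assert(stride_h == 32, "row pitch padded from 30 up to 32");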
...
@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not met");

-        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

        // blockwise copy
...
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp View file @ 8a4b5978
...
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);

-        constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});

        const auto block_work_multi_id =
...
@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        // global tensor view
        constexpr auto wei_c_k_global_desc =
-            make_ranked_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
+            make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});

        // LDS tensor view
        // be careful of alignment
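Weights here are stored C-Y-X-K (packed), and the two-argument maker builds a rank-2 (C, K) view of that storage: advancing one input channel skips a whole Y*X*K panel, while advancing one output channel moves to the adjacent element. A small sketch of how such explicit strides address memory, with hypothetical concrete sizes (offset mirrors what GetOffsetFromMultiIndex computes for rank 2):

// Hypothetical sizes for illustration only.
constexpr int Y = 3, X = 3, K = 128;

constexpr int stride_c = Y * X * K; // next input channel, same (y, x, k)
constexpr int stride_k = 1;         // next output channel is adjacent

constexpr int offset(int c, int k) { return c * stride_c + k * stride_k; }

// Element (c = 2, k = 5) of the packed C-Y-X-K weight, viewed at (y, x) = (0, 0):
static_assert(offset(2, 5) == 2 * 3 * 3 * 128 + 5, "rank-2 strided view");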
...
@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
            GemmDataPerReadA,
            GemmDataPerReadB);

-        constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});
...
@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not met");

-        constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

        // blockwise copy
...
@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        constexpr index_t K1 = KPerBlock / KPerThread;

 #if 0
-        constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});

-        constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor(
+        constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
            Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
 #else
        constexpr auto out_10d_global_desc =
...
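The 10-d view splits K into (K/(K1*K2), K1, K2), Wo into (Wo/(W1*W2*W3), W1, W2, W3) and N into (N/N1, N1), so that one thread's register tile (the _thread_desc, with 1s where a thread owns a single slice of a split factor) lines up with a plain slice of the global output and can be written out by the generic slice-copy machinery. The hard invariant is only that the split factors recompose the original extents; with hypothetical concrete numbers:

// Any K1, K2 with K % (K1 * K2) == 0 satisfies the decomposition.
constexpr int K = 128, K1 = 4, K2 = 4;
static_assert((K / (K1 * K2)) * K1 * K2 == K, "split factors must recompose K");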