Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
cd100621
Commit
cd100621
authored
May 10, 2021
by
Chao Liu
Browse files
refactor
parent
d8c89b68
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
135 additions
and
129 deletions
+135
-129
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+68
-65
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
...convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
+67
-64
No files found.
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @
cd100621
...
@@ -487,71 +487,74 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
...
@@ -487,71 +487,74 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(
in_left_pads
,
in_left_pads
,
in_right_pads
);
in_right_pads
);
float
ave_time
=
launch_kernel_dynamic_gemm_v1
<
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
BlockSize
,
{
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
,
float
ave_time
=
launch_kernel_dynamic_gemm_v1
<
TAcc
,
BlockSize
,
TOut
,
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
,
InMemoryDataOperation
::
Set
,
TAcc
,
decltype
(
descs
[
I0
]),
TOut
,
decltype
(
descs
[
I1
]),
InMemoryDataOperation
::
Set
,
decltype
(
descs
[
I2
]),
decltype
(
descs
[
I0
]),
decltype
(
descs
[
I3
]),
decltype
(
descs
[
I1
]),
GemmMPerBlock
,
decltype
(
descs
[
I2
]),
GemmNPerBlock
,
decltype
(
descs
[
I3
]),
GemmKPerBlock
,
GemmMPerBlock
,
GemmMPerThread
,
GemmNPerBlock
,
GemmNPerThread
,
GemmKPerBlock
,
GemmKPerThread
,
GemmMPerThread
,
GemmMLevel0Cluster
,
GemmNPerThread
,
GemmNLevel0Cluster
,
GemmKPerThread
,
GemmMLevel1Cluster
,
GemmMLevel0Cluster
,
GemmNLevel1Cluster
,
GemmNLevel0Cluster
,
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
,
GemmMLevel1Cluster
,
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
,
GemmNLevel1Cluster
,
Sequence
<
1
,
0
>
,
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
,
Sequence
<
1
,
0
>
,
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
,
0
,
Sequence
<
1
,
0
>
,
GemmABlockTransferSrcScalarPerVector_GemmK
,
Sequence
<
1
,
0
>
,
GemmABlockTransferDstScalarPerVector_GemmM
,
0
,
false
,
// don't move back src coordinate after threadwise copy
GemmABlockTransferSrcScalarPerVector_GemmK
,
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN
,
GemmABlockTransferDstScalarPerVector_GemmM
,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
0
,
1
>
,
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN
,
Sequence
<
0
,
1
>
,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN
,
1
,
Sequence
<
0
,
1
>
,
GemmBBlockTransferSrcScalarPerVector_GemmN
,
Sequence
<
0
,
1
>
,
GemmBBlockTransferDstScalarPerVector_GemmN
,
1
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused with
GemmBBlockTransferSrcScalarPerVector_GemmN
,
// MoveSrcSliceWindow() to save addr computation
GemmBBlockTransferDstScalarPerVector_GemmN
,
Sequence
<
2
,
3
,
0
,
1
>
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused with
3
,
// MoveSrcSliceWindow() to save addr computation
GemmCThreadTransferDstScalarPerVector_GemmN1
,
Sequence
<
2
,
3
,
0
,
1
>
,
decltype
(
descs
[
I4
]),
3
,
decltype
(
descs
[
I5
]),
GemmCThreadTransferDstScalarPerVector_GemmN1
,
decltype
(
descs
[
I6
]),
decltype
(
descs
[
I4
]),
decltype
(
descs
[
I7
]),
decltype
(
descs
[
I5
]),
decltype
(
descs
[
I8
])
>
(
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
decltype
(
descs
[
I6
]),
wei_k_c_y_x_device_buf
.
GetDeviceBuffer
()),
decltype
(
descs
[
I7
]),
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
decltype
(
descs
[
I8
])
>
(
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
in_n_c_hi_wi_device_buf
.
GetDeviceBuffer
()),
wei_k_c_y_x_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
out_n_k_ho_wo_device_buf
.
GetDeviceBuffer
()),
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
descs
[
I0
],
in_n_c_hi_wi_device_buf
.
GetDeviceBuffer
()),
descs
[
I1
],
static_cast
<
TOut
*>
(
out_n_k_ho_wo_device_buf
.
GetDeviceBuffer
()),
descs
[
I2
],
descs
[
I0
],
descs
[
I3
],
descs
[
I1
],
descs
[
I4
],
descs
[
I2
],
descs
[
I5
],
descs
[
I3
],
descs
[
I6
],
descs
[
I4
],
descs
[
I7
],
descs
[
I5
],
descs
[
I8
],
descs
[
I6
],
nrepeat
);
descs
[
I7
],
descs
[
I8
],
float
perf
=
(
float
)
calculate_convolution_flops
(
nrepeat
);
in_n_c_hi_wi_desc
,
wei_k_c_y_x_desc
,
out_n_k_ho_wo_desc
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
float
perf
=
(
float
)
calculate_convolution_flops
(
in_n_c_hi_wi_desc
,
wei_k_c_y_x_desc
,
out_n_k_ho_wo_desc
)
/
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
// copy result back to host
// copy result back to host
out_n_k_ho_wo_device_buf
.
FromDevice
(
out_n_k_ho_wo
.
mData
.
data
());
out_n_k_ho_wo_device_buf
.
FromDevice
(
out_n_k_ho_wo
.
mData
.
data
());
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp
View file @
cd100621
...
@@ -393,70 +393,73 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
...
@@ -393,70 +393,73 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(
in_left_pads
,
in_left_pads
,
in_right_pads
);
in_right_pads
);
float
ave_time
=
launch_kernel_dynamic_gemm_v1
<
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
BlockSize
,
{
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
,
float
ave_time
=
launch_kernel_dynamic_gemm_v1
<
TAcc
,
BlockSize
,
TOut
,
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
,
InMemoryDataOperation
::
Set
,
TAcc
,
decltype
(
descs
[
I0
]),
TOut
,
decltype
(
descs
[
I1
]),
InMemoryDataOperation
::
Set
,
decltype
(
descs
[
I2
]),
decltype
(
descs
[
I0
]),
decltype
(
descs
[
I3
]),
decltype
(
descs
[
I1
]),
GemmMPerBlock
,
decltype
(
descs
[
I2
]),
GemmNPerBlock
,
decltype
(
descs
[
I3
]),
GemmKPerBlock
,
GemmMPerBlock
,
GemmMPerThread
,
GemmNPerBlock
,
GemmNPerThread
,
GemmKPerBlock
,
GemmKPerThread
,
GemmMPerThread
,
GemmMLevel0Cluster
,
GemmNPerThread
,
GemmNLevel0Cluster
,
GemmKPerThread
,
GemmMLevel1Cluster
,
GemmMLevel0Cluster
,
GemmNLevel1Cluster
,
GemmNLevel0Cluster
,
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
,
GemmMLevel1Cluster
,
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
,
GemmNLevel1Cluster
,
Sequence
<
1
,
0
>
,
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
,
Sequence
<
1
,
0
>
,
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
,
0
,
Sequence
<
1
,
0
>
,
GemmABlockTransferSrcScalarPerVector_GemmK
,
Sequence
<
1
,
0
>
,
GemmABlockTransferDstScalarPerVector_GemmM
,
0
,
false
,
// don't move back src coordinate after threadwise copy
GemmABlockTransferSrcScalarPerVector_GemmK
,
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN
,
GemmABlockTransferDstScalarPerVector_GemmM
,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
1
,
0
>
,
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN
,
Sequence
<
1
,
0
>
,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN
,
0
,
Sequence
<
1
,
0
>
,
GemmBBlockTransferSrcScalarPerVector_GemmK
,
Sequence
<
1
,
0
>
,
GemmBBlockTransferDstScalarPerVector_GemmN
,
0
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused with
GemmBBlockTransferSrcScalarPerVector_GemmK
,
// MoveSrcSliceWindow() to save addr computation
GemmBBlockTransferDstScalarPerVector_GemmN
,
Sequence
<
2
,
3
,
0
,
1
>
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused with
1
,
// MoveSrcSliceWindow() to save addr computation
GemmCThreadTransferDstScalarPerVector_GemmM1
,
Sequence
<
2
,
3
,
0
,
1
>
,
decltype
(
descs
[
I4
]),
1
,
decltype
(
descs
[
I5
]),
GemmCThreadTransferDstScalarPerVector_GemmM1
,
decltype
(
descs
[
I6
]),
decltype
(
descs
[
I4
]),
decltype
(
descs
[
I7
]),
decltype
(
descs
[
I5
]),
decltype
(
descs
[
I8
])
>
(
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
decltype
(
descs
[
I6
]),
wei_k_y_x_c_device_buf
.
GetDeviceBuffer
()),
decltype
(
descs
[
I7
]),
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
decltype
(
descs
[
I8
])
>
(
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
in_n_hi_wi_c_device_buf
.
GetDeviceBuffer
()),
wei_k_y_x_c_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
out_n_ho_wo_k_device_buf
.
GetDeviceBuffer
()),
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
descs
[
I0
],
in_n_hi_wi_c_device_buf
.
GetDeviceBuffer
()),
descs
[
I1
],
static_cast
<
TOut
*>
(
out_n_ho_wo_k_device_buf
.
GetDeviceBuffer
()),
descs
[
I2
],
descs
[
I0
],
descs
[
I3
],
descs
[
I1
],
descs
[
I4
],
descs
[
I2
],
descs
[
I5
],
descs
[
I3
],
descs
[
I6
],
descs
[
I4
],
descs
[
I7
],
descs
[
I5
],
descs
[
I8
],
descs
[
I6
],
nrepeat
);
descs
[
I7
],
descs
[
I8
],
float
perf
=
(
float
)(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
)
/
nrepeat
);
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
float
perf
=
(
float
)(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
)
/
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
// copy result back to host
// copy result back to host
out_n_ho_wo_k_device_buf
.
FromDevice
(
out_n_ho_wo_k
.
mData
.
data
());
out_n_ho_wo_k_device_buf
.
FromDevice
(
out_n_ho_wo_k
.
mData
.
data
());
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment