Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel
Commits
56fc0842
You need to sign in or sign up before continuing.
Commit
56fc0842
authored
Aug 09, 2021
by
Chao Liu
Browse files
tidy
parent
54fba515
Changes
23
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
33 additions
and
93 deletions
+33
-93
CMakeLists.txt
CMakeLists.txt
+1
-2
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp
...el/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp
+3
-3
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp
...rnel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp
+0
-3
composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
...kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
+0
-1
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp
...ude/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp
+0
-11
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp
...ude/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp
+1
-14
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp
...clude/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp
+0
-2
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp
...de/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp
+0
-5
composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp
...nel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp
+0
-2
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp
+2
-8
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp
...ion_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp
+1
-6
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
...on_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
+2
-8
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
...on_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
+1
-6
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+1
-1
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
+2
-2
host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+5
-4
host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
...orward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
+5
-4
host/driver_offline/src/conv_bwd_driver_offline.cpp
host/driver_offline/src/conv_bwd_driver_offline.cpp
+8
-8
host/driver_offline/src/conv_fwd_driver_offline.cpp
host/driver_offline/src/conv_fwd_driver_offline.cpp
+0
-2
host/host_tensor/include/conv_common.hpp
host/host_tensor/include/conv_common.hpp
+1
-1
No files found.
CMakeLists.txt
View file @
56fc0842
...
@@ -142,12 +142,11 @@ enable_clang_tidy(
...
@@ -142,12 +142,11 @@ enable_clang_tidy(
-cppcoreguidelines-prefer-member-initializer
-cppcoreguidelines-prefer-member-initializer
${
MIOPEN_TIDY_CHECKS
}
${
MIOPEN_TIDY_CHECKS
}
${
MIOPEN_TIDY_ERRORS
}
${
MIOPEN_TIDY_ERRORS
}
HEADER_FILTER
HEADER_FILTER
"\.hpp$"
"\.hpp$"
EXTRA_ARGS
EXTRA_ARGS
-DMIOPEN_USE_CLANG_TIDY
-DMIOPEN_USE_CLANG_TIDY
)
)
include
(
CppCheck
)
include
(
CppCheck
)
...
...
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp
View file @
56fc0842
...
@@ -71,7 +71,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
...
@@ -71,7 +71,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
static
constexpr
index_t
N0
=
N
/
N1
;
static
constexpr
index_t
N0
=
N
/
N1
;
__host__
__device__
static
constexpr
auto
__host__
__device__
static
constexpr
auto
MakeAKM0M1BlockDescriptor
(
const
AKMBlockDesc
&
a_k_m_block_desc
)
MakeAKM0M1BlockDescriptor
(
const
AKMBlockDesc
&
/*
a_k_m_block_desc
*/
)
{
{
const
auto
a_k_m0_m1_block_desc
=
transform_dynamic_tensor_descriptor
(
const
auto
a_k_m0_m1_block_desc
=
transform_dynamic_tensor_descriptor
(
AKMBlockDesc
{},
AKMBlockDesc
{},
...
@@ -84,7 +84,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
...
@@ -84,7 +84,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
}
}
__host__
__device__
static
constexpr
auto
__host__
__device__
static
constexpr
auto
MakeBKN0N1BlockDescriptor
(
const
BKNBlockDesc
&
b_k_n_block_desc
)
MakeBKN0N1BlockDescriptor
(
const
BKNBlockDesc
&
/*
b_k_n_block_desc
*/
)
{
{
const
auto
b_k_n0_n1_block_desc
=
transform_dynamic_tensor_descriptor
(
const
auto
b_k_n0_n1_block_desc
=
transform_dynamic_tensor_descriptor
(
BKNBlockDesc
{},
BKNBlockDesc
{},
...
@@ -194,7 +194,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
...
@@ -194,7 +194,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
typename
ABlockBuffer
,
typename
ABlockBuffer
,
typename
BBlockBuffer
,
typename
BBlockBuffer
,
typename
CThreadBuffer
>
typename
CThreadBuffer
>
__device__
void
Run
(
const
CM0M1N0N1ThreadDesc
&
c_m0_m1_n0_n1_thread_desc
,
__device__
void
Run
(
const
CM0M1N0N1ThreadDesc
&
/*
c_m0_m1_n0_n1_thread_desc
*/
,
const
ABlockBuffer
&
a_block_buf
,
const
ABlockBuffer
&
a_block_buf
,
const
BBlockBuffer
&
b_block_buf
,
const
BBlockBuffer
&
b_block_buf
,
CThreadBuffer
&
c_thread_buf
)
const
CThreadBuffer
&
c_thread_buf
)
const
...
...
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp
View file @
56fc0842
...
@@ -120,9 +120,6 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
...
@@ -120,9 +120,6 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
"wrong! inconsistent type"
);
"wrong! inconsistent type"
);
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
a_block_mtx
=
BlockMatrixA
{};
constexpr
auto
a_block_mtx
=
BlockMatrixA
{};
...
...
composable_kernel/include/tensor_operation/blockwise_gemm_xdlops.hpp
View file @
56fc0842
...
@@ -270,7 +270,6 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline
...
@@ -270,7 +270,6 @@ struct BlockwiseGemmXdlops_km_kn_m0m1m2n_v1_2x2pipeline
const
index_t
waveId
=
thread_id
/
WaveSize
;
const
index_t
waveId
=
thread_id
/
WaveSize
;
const
index_t
laneId
=
thread_id
%
WaveSize
;
const
index_t
laneId
=
thread_id
%
WaveSize
;
const
index_t
waveId_m
=
waveId
/
NWaves
;
const
index_t
waveId_m
=
waveId
/
NWaves
;
const
index_t
waveId_n
=
waveId
%
NWaves
;
if
constexpr
(
xdlops_gemm
.
IsKReduction
)
if
constexpr
(
xdlops_gemm
.
IsKReduction
)
{
{
...
...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r2.hpp
View file @
56fc0842
...
@@ -619,17 +619,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
...
@@ -619,17 +619,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r2
// output: register to global memory
// output: register to global memory
{
{
constexpr
index_t
M11
=
M1PerThreadM111
*
M11N11ThreadClusterM1100
*
M11N11ThreadClusterM1101
;
constexpr
index_t
N11
=
N1PerThreadN111
*
M11N11ThreadClusterN1100
*
M11N11ThreadClusterN1101
;
constexpr
index_t
M10
=
MPerBlockM1
/
M11
;
constexpr
index_t
N10
=
NPerBlockN1
/
N11
;
constexpr
index_t
M111
=
M1PerThreadM111
;
constexpr
index_t
N111
=
N1PerThreadN111
;
constexpr
auto
c_m0_m10_m11_n0_n10_n11_thread_desc
=
constexpr
auto
c_m0_m10_m11_n0_n10_n11_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
I1
,
make_tuple
(
I1
,
...
...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v1r3.hpp
View file @
56fc0842
...
@@ -191,12 +191,12 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
...
@@ -191,12 +191,12 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
const
auto
M
=
a_k0_m_k1_grid_desc
.
GetLength
(
I1
);
const
auto
M
=
a_k0_m_k1_grid_desc
.
GetLength
(
I1
);
const
auto
N
=
b_k0_n_k1_grid_desc
.
GetLength
(
I1
);
const
auto
N
=
b_k0_n_k1_grid_desc
.
GetLength
(
I1
);
const
auto
K0
=
a_k0_m_k1_grid_desc
.
GetLength
(
I0
);
const
auto
K0
=
a_k0_m_k1_grid_desc
.
GetLength
(
I0
);
const
auto
K1
=
a_k0_m_k1_grid_desc
.
GetLength
(
I2
);
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return
(
M
==
c_m_n_grid_desc
.
GetLength
(
I0
)
&&
N
==
c_m_n_grid_desc
.
GetLength
(
I1
)
&&
return
(
M
==
c_m_n_grid_desc
.
GetLength
(
I0
)
&&
N
==
c_m_n_grid_desc
.
GetLength
(
I1
)
&&
K0
==
b_k0_n_k1_grid_desc
.
GetLength
(
I0
)
&&
K0
==
b_k0_n_k1_grid_desc
.
GetLength
(
I0
)
&&
K1
==
a_k0_m_k1_grid_desc
.
GetLength
(
I2
)
&&
K1
==
b_k0_n_k1_grid_desc
.
GetLength
(
I2
))
&&
K1
==
b_k0_n_k1_grid_desc
.
GetLength
(
I2
))
&&
(
M
%
MPerBlockM1
==
0
&&
N
%
NPerBlockN1
==
0
&&
K0
%
KPerBlock
==
0
);
(
M
%
MPerBlockM1
==
0
&&
N
%
NPerBlockN1
==
0
&&
K0
%
KPerBlock
==
0
);
}
}
...
@@ -608,19 +608,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
...
@@ -608,19 +608,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v1r3
// output: register to global memory
// output: register to global memory
{
{
constexpr
auto
M11
=
Number
<
container_reduce
(
M11N11ThreadClusterM110Xs
{},
math
::
multiplies_v2
{},
I1
)
*
M1PerThreadM111
>
{};
constexpr
auto
N11
=
Number
<
container_reduce
(
M11N11ThreadClusterN110Xs
{},
math
::
multiplies_v2
{},
I1
)
*
N1PerThreadN111
>
{};
constexpr
index_t
M10
=
MPerBlockM1
/
M11
;
constexpr
index_t
N10
=
NPerBlockN1
/
N11
;
constexpr
index_t
M111
=
M1PerThreadM111
;
constexpr
index_t
N111
=
N1PerThreadN111
;
constexpr
auto
c_m0_m10_m11_n0_n10_n11_thread_desc
=
constexpr
auto
c_m0_m10_m11_n0_n10_n11_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
I1
,
make_tuple
(
I1
,
...
...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_dlops_v2.hpp
View file @
56fc0842
...
@@ -102,7 +102,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
...
@@ -102,7 +102,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
// divide block work by [M, N]
// divide block work by [M, N]
#if 0
#if 0
const auto k_block_work_num = K / Number<KPerBlock>{};
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
...
@@ -114,7 +113,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
...
@@ -114,7 +113,6 @@ struct GridwiseDynamicGemmDlops_km_kn_mn_v3
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#else
#else
// Hack: this force result into SGPR
// Hack: this force result into SGPR
const
index_t
k_block_work_num
=
__builtin_amdgcn_readfirstlane
(
K
/
KPerBlock
);
const
index_t
ho_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Ho
/
HoPerBlock
);
const
index_t
ho_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Ho
/
HoPerBlock
);
const
index_t
wo_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Wo
/
WoPerBlock
);
const
index_t
wo_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Wo
/
WoPerBlock
);
const
index_t
hwo_block_work_num
=
ho_block_work_num
*
wo_block_work_num
;
const
index_t
hwo_block_work_num
=
ho_block_work_num
*
wo_block_work_num
;
...
...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_xdlops_v2r3.hpp
View file @
56fc0842
...
@@ -269,11 +269,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
...
@@ -269,11 +269,6 @@ struct GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3
const
CM0M1M2NGridDesc
&
c_m0_m1_m2_n_grid_desc
,
const
CM0M1M2NGridDesc
&
c_m0_m1_m2_n_grid_desc
,
const
CBlockClusterAdaptor
&
c_block_cluster_adaptor
)
const
CBlockClusterAdaptor
&
c_block_cluster_adaptor
)
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
const
auto
a_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
const
auto
a_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
p_a_grid
,
a_k0_m_k1_grid_desc
.
GetElementSpaceSize
());
p_a_grid
,
a_k0_m_k1_grid_desc
.
GetElementSpaceSize
());
const
auto
b_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
const
auto
b_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
...
...
composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp
View file @
56fc0842
...
@@ -57,8 +57,6 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
...
@@ -57,8 +57,6 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
E
=
ADesc
{}.
GetLength
(
I0
);
constexpr
auto
E
=
ADesc
{}.
GetLength
(
I0
);
constexpr
auto
K
=
ADesc
{}.
GetLength
(
I1
);
constexpr
auto
K
=
ADesc
{}.
GetLength
(
I1
);
...
...
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp
View file @
56fc0842
...
@@ -34,12 +34,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
...
@@ -34,12 +34,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
constexpr
auto
I8
=
Number
<
8
>
{};
DeviceMem
in_n_c_hi_wi_device_buf
(
sizeof
(
TInWei
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
in_n_c_hi_wi_device_buf
(
sizeof
(
TInWei
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c_y_x_device_buf
(
sizeof
(
TInWei
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c_y_x_device_buf
(
sizeof
(
TInWei
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
...
@@ -198,8 +192,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
...
@@ -198,8 +192,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
in_gemmk_gemmn0_gemmn1_grid_move_slice_window_iterator_hacks
,
in_gemmk_gemmn0_gemmn1_grid_move_slice_window_iterator_hacks
,
nrepeat
);
nrepeat
);
float
perf
=
(
float
)
calculate_convolution_flops
(
float
perf
=
static_cast
<
float
>
(
calculate_convolution_flops
(
in_n_c_hi_wi_desc
,
wei_k_c_y_x_desc
,
out_n_k_ho_wo_desc
)
/
in_n_c_hi_wi_desc
,
wei_k_c_y_x_desc
,
out_n_k_ho_wo_desc
)
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
...
...
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp
View file @
56fc0842
...
@@ -35,11 +35,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw
...
@@ -35,11 +35,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
constexpr
auto
I8
=
Number
<
8
>
{};
DeviceMem
in_n_hi_wi_c_device_buf
(
sizeof
(
TInWei
)
*
in_n_hi_wi_c
.
mDesc
.
GetElementSpace
());
DeviceMem
in_n_hi_wi_c_device_buf
(
sizeof
(
TInWei
)
*
in_n_hi_wi_c
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_y_x_c_device_buf
(
sizeof
(
TInWei
)
*
wei_k_y_x_c
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_y_x_c_device_buf
(
sizeof
(
TInWei
)
*
wei_k_y_x_c
.
mDesc
.
GetElementSpace
());
...
@@ -271,7 +266,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw
...
@@ -271,7 +266,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhw
const
auto
Y
=
wei_k_y_x_c_lengths
[
I1
];
const
auto
Y
=
wei_k_y_x_c_lengths
[
I1
];
const
auto
X
=
wei_k_y_x_c_lengths
[
I2
];
const
auto
X
=
wei_k_y_x_c_lengths
[
I2
];
float
perf
=
(
float
)
(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
)
/
float
perf
=
static_cast
<
float
>
(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
...
...
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp
View file @
56fc0842
...
@@ -34,12 +34,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
...
@@ -34,12 +34,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
constexpr
auto
I8
=
Number
<
8
>
{};
DeviceMem
in_n_c_hi_wi_device_buf
(
sizeof
(
TInWei
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
in_n_c_hi_wi_device_buf
(
sizeof
(
TInWei
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c_y_x_device_buf
(
sizeof
(
TInWei
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c_y_x_device_buf
(
sizeof
(
TInWei
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
...
@@ -194,8 +188,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
...
@@ -194,8 +188,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nk
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks
,
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_iterator_hacks
,
nrepeat
);
nrepeat
);
float
perf
=
(
float
)
calculate_convolution_flops
(
float
perf
=
static_cast
<
float
>
(
calculate_convolution_flops
(
in_n_c_hi_wi_desc
,
wei_k_c_y_x_desc
,
out_n_k_ho_wo_desc
)
/
in_n_c_hi_wi_desc
,
wei_k_c_y_x_desc
,
out_n_k_ho_wo_desc
)
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
...
...
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
View file @
56fc0842
...
@@ -35,11 +35,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
...
@@ -35,11 +35,6 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
constexpr
auto
I7
=
Number
<
7
>
{};
constexpr
auto
I8
=
Number
<
8
>
{};
DeviceMem
in_n_hi_wi_c_device_buf
(
sizeof
(
TInWei
)
*
in_n_hi_wi_c
.
mDesc
.
GetElementSpace
());
DeviceMem
in_n_hi_wi_c_device_buf
(
sizeof
(
TInWei
)
*
in_n_hi_wi_c
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_y_x_c_device_buf
(
sizeof
(
TInWei
)
*
wei_k_y_x_c
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_y_x_c_device_buf
(
sizeof
(
TInWei
)
*
wei_k_y_x_c
.
mDesc
.
GetElementSpace
());
...
@@ -352,7 +347,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
...
@@ -352,7 +347,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
const
auto
Y
=
wei_k_y_x_c_lengths
[
I1
];
const
auto
Y
=
wei_k_y_x_c_lengths
[
I1
];
const
auto
X
=
wei_k_y_x_c_lengths
[
I2
];
const
auto
X
=
wei_k_y_x_c_lengths
[
I2
];
float
perf
=
(
float
)
(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
)
/
float
perf
=
static_cast
<
float
>
(
(
std
::
size_t
(
2
)
*
N
*
K
*
Ho
*
Wo
*
C
*
Y
*
X
)
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
...
...
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
View file @
56fc0842
...
@@ -26,7 +26,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
...
@@ -26,7 +26,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
const
Tensor
<
TInWei
>&
in_n_c_hi_wi
,
const
Tensor
<
TInWei
>&
in_n_c_hi_wi
,
const
Tensor
<
TInWei
>&
wei_k_c_y_x
,
const
Tensor
<
TInWei
>&
wei_k_c_y_x
,
Tensor
<
TOut
>&
out_n_k_ho_wo
,
Tensor
<
TOut
>&
out_n_k_ho_wo
,
ck
::
index_t
nrepeat
)
ck
::
index_t
/*
nrepeat
*/
)
{
{
using
namespace
ck
;
using
namespace
ck
;
...
...
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
View file @
56fc0842
...
@@ -232,8 +232,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
...
@@ -232,8 +232,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
in_grid_move_slice_window_iterator_hacks
,
in_grid_move_slice_window_iterator_hacks
,
nrepeat
);
nrepeat
);
float
perf
=
(
float
)
calculate_convolution_flops
(
float
perf
=
static_cast
<
float
>
(
calculate_convolution_flops
(
in_desc_n_c_hi_wi
,
wei_desc_k_c_y_x
,
out_desc_n_k_ho_wo
)
/
in_desc_n_c_hi_wi
,
wei_desc_k_c_y_x
,
out_desc_n_k_ho_wo
)
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
...
...
host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
View file @
56fc0842
...
@@ -338,10 +338,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
...
@@ -338,10 +338,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
float
ave_time
=
timer
.
GetElapsedTime
()
/
nrepeat
;
float
ave_time
=
timer
.
GetElapsedTime
()
/
nrepeat
;
float
perf
=
(
float
)
calculate_convolution_flops
(
in_n_c_hi_wi_global_desc
,
float
perf
=
wei_k_c_y_x_global_desc
,
static_cast
<
float
>
(
calculate_convolution_flops
(
in_n_c_hi_wi_global_desc
,
out_n_k0_ho_wo_k1_global_desc
)
/
wei_k_c_y_x_global_desc
,
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
out_n_k0_ho_wo_k1_global_desc
))
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
...
...
host/driver_offline/include/driver_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
View file @
56fc0842
...
@@ -354,10 +354,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -354,10 +354,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
float
ave_time
=
timer
.
GetElapsedTime
()
/
nrepeat
;
float
ave_time
=
timer
.
GetElapsedTime
()
/
nrepeat
;
float
perf
=
(
float
)
calculate_convolution_flops
(
in_n_c_hi_wi_global_desc
,
float
perf
=
wei_k_c_y_x_global_desc
,
static_cast
<
float
>
(
calculate_convolution_flops
(
in_n_c_hi_wi_global_desc
,
out_n_k0_ho_wo_k1_global_desc
)
/
wei_k_c_y_x_global_desc
,
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
out_n_k0_ho_wo_k1_global_desc
))
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
<<
std
::
endl
;
...
...
host/driver_offline/src/conv_bwd_driver_offline.cpp
View file @
56fc0842
...
@@ -128,10 +128,8 @@ int main(int argc, char* argv[])
...
@@ -128,10 +128,8 @@ int main(int argc, char* argv[])
std
::
vector
<
std
::
size_t
>
in_lengths_host
(
4
),
wei_lengths_host
(
4
),
out_lengths_host
(
4
);
std
::
vector
<
std
::
size_t
>
in_lengths_host
(
4
),
wei_lengths_host
(
4
),
out_lengths_host
(
4
);
switch
(
layout
)
if
(
layout
==
ConvTensorLayout
::
NCHW
)
{
{
case
ConvTensorLayout
::
NCHW
:
// NCHW
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
C
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
...
@@ -144,9 +142,9 @@ int main(int argc, char* argv[])
...
@@ -144,9 +142,9 @@ int main(int argc, char* argv[])
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
K
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
break
;
}
case
ConvTensorLayout
::
NHWC
:
else
if
(
layout
==
ConvTensorLayout
::
NHWC
)
// NHWC
{
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
0
]
=
static_cast
<
std
::
size_t
>
(
N
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Hi
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
in_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wi
);
...
@@ -159,8 +157,10 @@ int main(int argc, char* argv[])
...
@@ -159,8 +157,10 @@ int main(int argc, char* argv[])
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
1
]
=
static_cast
<
std
::
size_t
>
(
Ho
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
out_lengths_host
[
2
]
=
static_cast
<
std
::
size_t
>
(
Wo
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
K
);
out_lengths_host
[
3
]
=
static_cast
<
std
::
size_t
>
(
K
);
break
;
}
default:
throw
std
::
runtime_error
(
"wrong! not implemented"
);
else
{
throw
std
::
runtime_error
(
"wrong! not implemented"
);
}
}
Tensor
<
in_data_t
>
in_host
(
in_lengths_host
);
Tensor
<
in_data_t
>
in_host
(
in_lengths_host
);
...
...
host/driver_offline/src/conv_fwd_driver_offline.cpp
View file @
56fc0842
...
@@ -467,7 +467,6 @@ int main(int argc, char* argv[])
...
@@ -467,7 +467,6 @@ int main(int argc, char* argv[])
check_error
(
out_host
,
out_device
);
check_error
(
out_host
,
out_device
);
#if 0
if
(
do_log
)
if
(
do_log
)
{
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"in : "
,
in
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"in : "
,
in
.
mData
,
","
)
<<
std
::
endl
;
...
@@ -475,6 +474,5 @@ int main(int argc, char* argv[])
...
@@ -475,6 +474,5 @@ int main(int argc, char* argv[])
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_host
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_host : "
,
out_host
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out_device
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"out_device: "
,
out_device
.
mData
,
","
)
<<
std
::
endl
;
}
}
#endif
}
}
}
}
host/host_tensor/include/conv_common.hpp
View file @
56fc0842
...
@@ -62,7 +62,7 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(
...
@@ -62,7 +62,7 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(
template
<
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
constexpr
std
::
size_t
constexpr
std
::
size_t
calculate_convolution_flops
(
const
InDesc
&
in_desc
,
const
WeiDesc
&
wei_desc
,
const
OutDesc
&
out_desc
)
calculate_convolution_flops
(
const
InDesc
&
,
const
WeiDesc
&
wei_desc
,
const
OutDesc
&
out_desc
)
{
{
using
namespace
ck
;
using
namespace
ck
;
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment