Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7e87e0b3
Commit
7e87e0b3
authored
Sep 15, 2021
by
Jing Zhang
Browse files
tuning
parent
f87dddae
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
53 additions
and
55 deletions
+53
-55
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
...ernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
+24
-24
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+3
-3
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+16
-18
host/driver_offline/src/conv_fwd_driver_offline.cpp
host/driver_offline/src/conv_fwd_driver_offline.cpp
+10
-10
No files found.
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
View file @
7e87e0b3
...
@@ -114,7 +114,7 @@ template <index_t BlockSize,
...
@@ -114,7 +114,7 @@ template <index_t BlockSize,
index_t
KPerBlock
,
index_t
KPerBlock
,
index_t
HoPerBlock
,
index_t
HoPerBlock
,
index_t
WoPerBlock
,
index_t
WoPerBlock
,
index_t
EPerBlock
,
index_t
E
1
PerBlock
,
index_t
KPerThread
,
index_t
KPerThread
,
index_t
HoPerThread
,
index_t
HoPerThread
,
index_t
WoPerThread
,
index_t
WoPerThread
,
...
@@ -185,7 +185,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -185,7 +185,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
auto
c_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
auto
c_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
p_c_global
,
c_k_n_ho_wo_global_desc
.
GetElementSpaceSize
());
p_c_global
,
c_k_n_ho_wo_global_desc
.
GetElementSpaceSize
());
static_assert
(
E1
%
EPerBlock
==
0
,
""
);
static_assert
(
E1
%
E
1
PerBlock
==
0
,
""
);
// const auto E = a_e0_e1_k_e2_global_desc.GetLength(I0);
// const auto E = a_e0_e1_k_e2_global_desc.GetLength(I0);
// const auto K = a_e0_e1_k_e2_global_desc.GetLength(I1);
// const auto K = a_e0_e1_k_e2_global_desc.GetLength(I1);
...
@@ -229,10 +229,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -229,10 +229,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3
make_tuple
(
Number
<
I1
>
{},
Number
<
E1
>
{},
Number
<
KPerBlock
>
{},
Number
<
E2
>
{}),
make_tuple
(
Number
<
I1
>
{},
Number
<
E1
>
{},
Number
<
KPerBlock
>
{},
Number
<
E2
>
{}),
max_lds_align
);
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// B matrix in thread, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
b_e1_n_ho_wo_e2_block_desc
=
constexpr
auto
b_e1_n_ho_wo_e2_block_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
EPerBlock
>
{},
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
E
1
PerBlock
>
{},
Number
<
1
>
{},
Number
<
1
>
{},
Number
<
HoPerBlock
>
{},
Number
<
HoPerBlock
>
{},
Number
<
WoPerBlock
>
{},
Number
<
WoPerBlock
>
{},
...
@@ -244,7 +243,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -244,7 +243,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
Number
<
KPerThread
>
{},
Number
<
1
>
{},
Number
<
HoPerThread
>
{},
Number
<
WoPerThread
>
{}));
Number
<
KPerThread
>
{},
Number
<
1
>
{},
Number
<
HoPerThread
>
{},
Number
<
WoPerThread
>
{}));
constexpr
auto
a_e1_k_e2_block_desc
=
make_naive_tensor_descriptor_aligned
(
constexpr
auto
a_e1_k_e2_block_desc
=
make_naive_tensor_descriptor_aligned
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
E2
>
{}),
max_lds_align
);
make_tuple
(
Number
<
E
1
PerBlock
>
{},
Number
<
KPerBlock
>
{},
Number
<
E2
>
{}),
max_lds_align
);
auto
blockwise_gemm
=
auto
blockwise_gemm
=
BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
<
BlockSize
,
BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
<
BlockSize
,
...
@@ -295,16 +294,17 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -295,16 +294,17 @@ struct GridwiseGemmDlops_km_kn_mn_v3
1
,
1
,
1
,
1
,
AThreadTransferSrcResetCoordinateAfterRun
,
AThreadTransferSrcResetCoordinateAfterRun
,
true
>
(
a_e0_e1_k_e2_global_desc
,
false
>
(
make_multi_index
(
0
,
0
,
k_block_data_on_global
,
0
),
a_e0_e1_k_e2_global_desc
,
a_e0_e1_k_e2_block_desc
,
make_multi_index
(
0
,
0
,
k_block_data_on_global
,
0
),
make_multi_index
(
0
,
0
,
0
,
0
));
a_e0_e1_k_e2_block_desc
,
make_multi_index
(
0
,
0
,
0
,
0
));
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
I1
,
0
,
0
,
0
);
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
I1
,
0
,
0
,
0
);
constexpr
auto
b_e0_e1_n_ho_wo_e2_thread_desc
=
constexpr
auto
b_e0_e1_n_ho_wo_e2_thread_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
Number
<
EPerBlock
>
{},
Number
<
E
1
PerBlock
>
{},
Number
<
1
>
{},
Number
<
1
>
{},
Number
<
HoPerThread
>
{},
Number
<
HoPerThread
>
{},
Number
<
WoPerThread
>
{},
Number
<
WoPerThread
>
{},
...
@@ -315,7 +315,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -315,7 +315,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
FloatAB
,
FloatAB
,
decltype
(
b_e0_e1_n_ho_wo_e2_global_desc
),
decltype
(
b_e0_e1_n_ho_wo_e2_global_desc
),
decltype
(
b_e0_e1_n_ho_wo_e2_thread_desc
),
decltype
(
b_e0_e1_n_ho_wo_e2_thread_desc
),
Sequence
<
I1
,
EPerBlock
,
1
,
HoPerThread
,
WoPerThread
,
E2
>
,
Sequence
<
I1
,
E
1
PerBlock
,
1
,
HoPerThread
,
WoPerThread
,
E2
>
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferSrcScalarPerVector
,
...
@@ -344,7 +344,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -344,7 +344,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
Sequence
<
KPerThread
,
1
,
HoPerThread
,
WoPerThread
>>
{}
Sequence
<
KPerThread
,
1
,
HoPerThread
,
WoPerThread
>>
{}
.
Run
(
c_k_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
c_thread_buf
,
FloatAcc
{
0
});
.
Run
(
c_k_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
c_thread_buf
,
FloatAcc
{
0
});
constexpr
auto
b_thread_slice_copy_step
=
make_multi_index
(
0
,
EPerBlock
,
0
,
0
,
0
,
0
);
constexpr
auto
b_thread_slice_copy_step
=
make_multi_index
(
0
,
E
1
PerBlock
,
0
,
0
,
0
,
0
);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr
auto
a_e0_e1_k_e2_global_step_hacks
=
AGlobalStepHacks
{};
constexpr
auto
a_e0_e1_k_e2_global_step_hacks
=
AGlobalStepHacks
{};
...
@@ -407,7 +407,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -407,7 +407,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on current data
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
E
1
PerBlock
,
0
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
,
b_thread_slice_copy_step
,
...
@@ -423,11 +423,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -423,11 +423,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on current data
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
E
1
PerBlock
,
0
,
0
));
e1_block_data_begin
+=
2
*
EPerBlock
;
e1_block_data_begin
+=
2
*
E
1
PerBlock
;
}
while
(
e1_block_data_begin
<
E1
-
2
*
EPerBlock
);
}
while
(
e1_block_data_begin
<
E1
-
2
*
E
1
PerBlock
);
}
}
// LDS double buffer: tail
// LDS double buffer: tail
...
@@ -447,7 +447,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -447,7 +447,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on 2nd-last data
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
E
1
PerBlock
,
0
,
0
));
// LDS double buffer: GEMM on last data
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
...
@@ -462,7 +462,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -462,7 +462,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
a_block_slice_copy_step
,
a_block_slice_copy_step
,
AGlobalMoveSliceWindowStepHacks
{});
AGlobalMoveSliceWindowStepHacks
{});
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
-
(
E1
-
EPerBlock
),
0
,
0
));
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
-
(
E1
-
E
1
PerBlock
),
0
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
,
b_thread_slice_copy_step
,
...
@@ -514,7 +514,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -514,7 +514,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on current data
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
E
1
PerBlock
,
0
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
,
b_thread_slice_copy_step
,
...
@@ -530,11 +530,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -530,11 +530,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on current data
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
E
1
PerBlock
,
0
,
0
));
e1_block_data_begin
+=
2
*
EPerBlock
;
e1_block_data_begin
+=
2
*
E
1
PerBlock
;
}
while
(
e1_block_data_begin
<
E1
-
2
*
EPerBlock
);
}
while
(
e1_block_data_begin
<
E1
-
2
*
E
1
PerBlock
);
}
}
// LDS double buffer: tail
// LDS double buffer: tail
...
@@ -554,7 +554,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -554,7 +554,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
// LDS double buffer: GEMM on 2nd-last data
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
E
1
PerBlock
,
0
,
0
));
// LDS double buffer: GEMM on last data
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
...
...
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
View file @
7e87e0b3
...
@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
...
@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
const
auto
Y
=
wei_k_c_y_x_lengths
[
I2
];
const
auto
Y
=
wei_k_c_y_x_lengths
[
I2
];
const
auto
X
=
wei_k_c_y_x_lengths
[
I3
];
const
auto
X
=
wei_k_c_y_x_lengths
[
I3
];
constexpr
auto
InWeiVectorSize
=
4
;
constexpr
auto
InWeiVectorSize
=
8
;
#if 1
#if 1
const
auto
C0
=
C
/
Number
<
InWeiVectorSize
>
{};
const
auto
C0
=
C
/
Number
<
InWeiVectorSize
>
{};
...
@@ -106,9 +106,9 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
...
@@ -106,9 +106,9 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
E1
=
4
*
9
;
constexpr
index_t
E1
=
2
*
9
;
constexpr
index_t
E2
=
C1
;
constexpr
index_t
E2
=
C1
;
constexpr
index_t
EPerBlock
=
4
;
constexpr
index_t
EPerBlock
=
2
;
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
HoPerThread
=
2
;
...
...
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
View file @
7e87e0b3
...
@@ -10,8 +10,8 @@ template <ck::index_t BlockSize,
...
@@ -10,8 +10,8 @@ template <ck::index_t BlockSize,
typename
FloatAB
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatAcc
,
typename
FloatC
,
typename
FloatC
,
ck
::
index_t
E1
,
ck
::
index_t
E1
_
,
ck
::
index_t
E2
,
ck
::
index_t
E2
_
,
ck
::
index_t
KPerBlock
,
ck
::
index_t
KPerBlock
,
ck
::
index_t
HoPerBlock
,
ck
::
index_t
HoPerBlock
,
ck
::
index_t
WoPerBlock
,
ck
::
index_t
WoPerBlock
,
...
@@ -95,12 +95,15 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -95,12 +95,15 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
const
auto
E
=
C0
*
Y
*
X
;
const
auto
E
=
C0
*
Y
*
X
;
constexpr
auto
E1
=
Number
<
E1_
>
{};
constexpr
auto
E2
=
Number
<
E2_
>
{};
static_assert
(
E2
==
C1
,
""
);
static_assert
(
E2
==
C1
,
""
);
const
auto
E0
=
E
/
E1
;
const
auto
E0
=
E
/
E1
;
// weight tensor
// weight tensor
const
auto
a_e
0
_k_e2_grid_desc
=
transform_tensor_descriptor
(
const
auto
a_e_k_e2_grid_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C0
*
Y
*
X
,
E2
)),
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C0
*
Y
*
X
,
E2
)),
make_tuple
(
make_pass_through_transform
(
K
),
make_tuple
(
make_pass_through_transform
(
K
),
make_pass_through_transform
(
C0
*
Y
*
X
),
make_pass_through_transform
(
C0
*
Y
*
X
),
...
@@ -109,7 +112,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -109,7 +112,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{}));
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{}));
const
auto
a_e0_e1_k_e2_grid_desc
=
const
auto
a_e0_e1_k_e2_grid_desc
=
transform_tensor_descriptor
(
a_e
0
_k_e2_grid_desc
,
transform_tensor_descriptor
(
a_e_k_e2_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
E0
,
E1
)),
make_tuple
(
make_unmerge_transform
(
make_tuple
(
E0
,
E1
)),
make_pass_through_transform
(
K
),
make_pass_through_transform
(
K
),
make_pass_through_transform
(
E2
)),
make_pass_through_transform
(
E2
)),
...
@@ -139,7 +142,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -139,7 +142,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_tuple
(
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{},
Sequence
<
6
>
{}));
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{},
Sequence
<
6
>
{}));
const
auto
b_e
0
_n_ho_wo_e2_grid_desc
=
transform_tensor_descriptor
(
const
auto
b_e_n_ho_wo_e2_grid_desc
=
transform_tensor_descriptor
(
in_n_c0_y_ho_x_wo_c1_global_desc
,
in_n_c0_y_ho_x_wo_c1_global_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
C0
,
Y
,
X
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
C0
,
Y
,
X
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
N
),
...
@@ -151,7 +154,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -151,7 +154,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
b_e0_e1_n_ho_wo_e2_grid_desc
=
transform_tensor_descriptor
(
const
auto
b_e0_e1_n_ho_wo_e2_grid_desc
=
transform_tensor_descriptor
(
b_e
0
_n_ho_wo_e2_grid_desc
,
b_e_n_ho_wo_e2_grid_desc
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
E0
,
E1
)),
make_tuple
(
make_unmerge_transform
(
make_tuple
(
E0
,
E1
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
Hop
),
make_pass_through_transform
(
Hop
),
...
@@ -199,13 +202,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -199,13 +202,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack
=
constexpr
auto
b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{};
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{};
...
@@ -245,17 +248,17 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -245,17 +248,17 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
ABlockTransferThreadSliceLengths_E0_E1_K_E2
,
ABlockTransferThreadSliceLengths_E0_E1_K_E2
,
ABlockTransferThreadClusterLengths_E0_E1_K_E2
,
ABlockTransferThreadClusterLengths_E0_E1_K_E2
,
Sequence
<
2
,
0
,
1
,
3
>
,
Sequence
<
2
,
0
,
1
,
3
>
,
Sequence
<
2
,
0
,
1
,
3
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
3
,
ABlockTransferSrcScalarPerVector_E2
,
ABlockTransferSrcScalarPerVector_E2
,
ABlockTransferDstScalarPerVector_E2
,
ABlockTransferDstScalarPerVector_E2
,
false
,
// don't move back src coordinate after threadwise copy
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
0
,
2
,
3
,
4
,
1
,
5
>
,
Sequence
<
0
,
1
,
2
,
3
,
4
,
5
>
,
5
,
5
,
BThreadTransferSrcScalarPerVector_E2
,
BThreadTransferSrcScalarPerVector_E2
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused with
false
,
// don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
// MoveSrcSliceWindow() to save addr computation
Sequence
<
2
,
3
,
1
,
0
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
0
,
0
,
CThreadTransferDstScalarPerVector_K
,
CThreadTransferDstScalarPerVector_K
,
decltype
(
a_e0_e1_k_e2_global_step_hacks
),
decltype
(
a_e0_e1_k_e2_global_step_hacks
),
...
@@ -286,9 +289,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -286,9 +289,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
using
CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo
=
using
CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo
=
decltype
(
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
);
decltype
(
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
);
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
float
ave_time
=
0
;
float
ave_time
=
0
;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
{
const
auto
kernel
=
const
auto
kernel
=
...
@@ -393,8 +396,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -393,8 +396,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
c_k_n_hop_wop_grid_desc
,
c_k_n_hop_wop_grid_desc
,
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
);
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
);
}
}
return
ave_time
;
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem
a_e0_e1_k_e2_grid_desc_dev_buf
(
sizeof
(
AGridDesc_E0_E1_K_E2
));
DeviceMem
a_e0_e1_k_e2_grid_desc_dev_buf
(
sizeof
(
AGridDesc_E0_E1_K_E2
));
DeviceMem
b_e0_e1_n_ho_wo_e2_grid_desc_dev_buf
(
sizeof
(
BGridDesc_E0_E1_N_Ho_Wo_E2
));
DeviceMem
b_e0_e1_n_ho_wo_e2_grid_desc_dev_buf
(
sizeof
(
BGridDesc_E0_E1_N_Ho_Wo_E2
));
...
@@ -408,8 +409,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -408,8 +409,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf
.
ToDevice
(
c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf
.
ToDevice
(
&
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
);
&
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
);
float
ave_time
=
0
;
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
{
const
auto
kernel
=
const
auto
kernel
=
...
@@ -534,9 +533,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
...
@@ -534,9 +533,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
cast_pointer_to_constant_address_space
(
cast_pointer_to_constant_address_space
(
c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
}
}
return
ave_time
;
#endif
#endif
return
ave_time
;
}
}
};
};
#endif
#endif
host/driver_offline/src/conv_fwd_driver_offline.cpp
View file @
7e87e0b3
...
@@ -20,12 +20,12 @@
...
@@ -20,12 +20,12 @@
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE
1
#define USE_DYNAMIC_MODE
0
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NHWC
1
#define USE_CONV_FWD_V5R1_NHWC
0
#define USE_CONV_FWD_V5R1_NCHWC
0
#define USE_CONV_FWD_V5R1_NCHWC
1
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
...
@@ -105,16 +105,16 @@ int main(int argc, char* argv[])
...
@@ -105,16 +105,16 @@ int main(int argc, char* argv[])
const
bool
do_log
=
std
::
stoi
(
argv
[
5
]);
const
bool
do_log
=
std
::
stoi
(
argv
[
5
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
6
]);
const
int
nrepeat
=
std
::
stoi
(
argv
[
6
]);
constexpr
auto
N
=
Number
<
1
28
>
{};
constexpr
auto
N
=
Number
<
1
>
{};
constexpr
auto
C
=
Number
<
1
92
>
{};
constexpr
auto
C
=
Number
<
1
6
>
{};
constexpr
auto
Hi
=
Number
<
7
1
>
{};
constexpr
auto
Hi
=
Number
<
1
080
>
{};
constexpr
auto
Wi
=
Number
<
7
1
>
{};
constexpr
auto
Wi
=
Number
<
1
920
>
{};
constexpr
auto
K
=
Number
<
25
6
>
{};
constexpr
auto
K
=
Number
<
1
6
>
{};
constexpr
auto
Y
=
Number
<
3
>
{};
constexpr
auto
Y
=
Number
<
3
>
{};
constexpr
auto
X
=
Number
<
3
>
{};
constexpr
auto
X
=
Number
<
3
>
{};
constexpr
auto
conv_stride_h
=
I
2
;
constexpr
auto
conv_stride_h
=
I
1
;
constexpr
auto
conv_stride_w
=
I
2
;
constexpr
auto
conv_stride_w
=
I
1
;
constexpr
auto
conv_dilation_h
=
I1
;
constexpr
auto
conv_dilation_h
=
I1
;
constexpr
auto
conv_dilation_w
=
I1
;
constexpr
auto
conv_dilation_w
=
I1
;
constexpr
auto
in_left_pad_h
=
I1
;
constexpr
auto
in_left_pad_h
=
I1
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment