Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f87dddae
Commit
f87dddae
authored
Sep 15, 2021
by
Jing Zhang
Browse files
add BGlobalMoveSliceWindowStepHacks{}
parent
31a440b9
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
42 additions
and
16 deletions
+42
-16
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
...ernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
+14
-7
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp
...ude/tensor_operation/threadwise_tensor_slice_transfer.hpp
+19
-0
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
+5
-5
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
+4
-4
No files found.
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
View file @
f87dddae
...
...
@@ -394,7 +394,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
{
// even iteration
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
b_thread_slice_copy_step
,
BGlobalMoveSliceWindowStepHacks
{});
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
...
...
@@ -409,7 +410,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
b_thread_slice_copy_step
,
BGlobalMoveSliceWindowStepHacks
{});
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
...
...
@@ -432,7 +434,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
if
constexpr
(
HasDoubleTailE1BlockLoop
)
// if has 2 iteration left
{
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
b_thread_slice_copy_step
,
BGlobalMoveSliceWindowStepHacks
{});
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
...
...
@@ -462,7 +465,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
-
(
E1
-
EPerBlock
),
0
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
b_thread_slice_copy_step
,
BGlobalMoveSliceWindowStepHacks
{});
e0_block_data_begin
+=
1
;
...
...
@@ -497,7 +501,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
{
// even iteration
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
b_thread_slice_copy_step
,
BGlobalMoveSliceWindowStepHacks
{});
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
...
...
@@ -512,7 +517,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
b_thread_slice_copy_step
,
BGlobalMoveSliceWindowStepHacks
{});
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
...
...
@@ -535,7 +541,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
if
constexpr
(
HasDoubleTailE1BlockLoop
)
// if has 2 iteration left
{
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
b_thread_slice_copy_step
,
BGlobalMoveSliceWindowStepHacks
{});
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
...
...
composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp
View file @
f87dddae
...
...
@@ -666,6 +666,25 @@ struct ThreadwiseTensorSliceTransfer_v2
move_tensor_coordinate
(
src_desc
,
src_coord_
,
adjusted_step
);
}
// Moves the source slice window by the given step, forwarding a caller-supplied step hack
// to make_tensor_coordinate_step().
// src_slice_origin_step_idx needs to be known at compile-time, for performance reasons.
template <typename SrcMoveSliceWindowStepHack>
__device__ void
MoveSrcSliceWindow(const SrcDesc& src_desc,
                   const Index& src_slice_origin_step_idx,
                   const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
{
    // If the src coordinate is not reset after a run, the reset step has to be
    // added to the requested step here; otherwise the requested step is used as-is.
    const auto step_idx = !SrcResetCoordinateAfterRun
                              ? src_slice_origin_step_idx + GetSrcCoordinateResetStep()
                              : src_slice_origin_step_idx;

    // Open question carried over from the original: is it OK to construct a new step
    // on every call?
    const auto coord_step =
        make_tensor_coordinate_step(src_desc, step_idx, src_move_slice_window_step_hack);

    move_tensor_coordinate(src_desc, src_coord_, coord_step);
}
private:
SrcCoord
src_coord_
;
};
// namespace ck
...
...
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
View file @
f87dddae
...
...
@@ -66,8 +66,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr index_t HoPerBlock = 16;
constexpr index_t WoPerBlock = 16;
constexpr index_t E1 =
2
;
constexpr index_t E2 =
8
;
constexpr index_t E1 =
4
;
constexpr index_t E2 =
4
;
constexpr index_t EPerBlock = 2;
constexpr index_t KPerThread = KPerBlock;
...
...
@@ -75,8 +75,8 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr index_t WoPerThread = 2;
constexpr index_t EPerThread = 1;
using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 1, 1,
8
>;
using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, EPerBlock,
16,
1>;
using ABlockTransferThreadSliceLengths_E0_E1_K_E2 = Sequence<1, 1, 1,
E2
>;
using ABlockTransferThreadClusterLengths_E0_E1_K_E2 = Sequence<1, E
1, K
PerBlock, 1>;
constexpr index_t ABlockTransferSrcScalarPerVector_E2 = E2;
constexpr index_t ABlockTransferDstScalarPerVector_E2 = E2;
...
...
@@ -100,7 +100,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
EPerThread
=
EPerBlock
;
constexpr
index_t
EPerThread
=
1
;
using
ABlockTransferThreadSliceLengths_E0_E1_K_E2
=
Sequence
<
1
,
9
,
1
,
E2
>
;
using
ABlockTransferThreadClusterLengths_E0_E1_K_E2
=
Sequence
<
1
,
EPerBlock
,
16
,
1
>
;
...
...
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
View file @
f87dddae
...
...
@@ -53,7 +53,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
const
auto
N
=
in_n_hi_wi_c_global_desc
.
GetLength
(
I0
);
const
auto
Hi
=
in_n_hi_wi_c_global_desc
.
GetLength
(
I1
);
...
...
@@ -268,13 +267,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
const
auto
grid_size
=
(
K
/
KPerBlock
)
*
(
Hop
/
HoPerBlock
)
*
(
Wop
/
WoPerBlock
)
*
N
;
constexpr
bool
has_main_k_block_loop
=
(
E1
+
E1PerBlock
)
/
(
2
*
E1PerBlock
)
>
1
;
constexpr
bool
has_main_k_block_loop
=
(
E1
+
E1PerBlock
)
/
(
2
*
E1PerBlock
)
>
1
;
constexpr
bool
has_double_tail_k_block_loop
=
(
E1
/
E1PerBlock
)
%
2
==
0
;
const
bool
has_e0_block_loop
=
E0
>
1
;
std
::
cerr
<<
"has_main_k_block_loop = "
<<
has_main_k_block_loop
<<
" has_double_tail_k_block_loop = "
<<
has_double_tail_k_block_loop
<<
std
::
endl
;
<<
" has_e0_block_loop = "
<<
has_e0_block_loop
<<
std
::
endl
;
const
auto
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_pass_through_transform
(
I0
)),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment