Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
31a440b9
Commit
31a440b9
authored
Sep 15, 2021
by
Jing Zhang
Browse files
fixed; tuning
parent
b505370e
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
205 additions
and
108 deletions
+205
-108
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
...ernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
+184
-86
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
+5
-5
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
+16
-17
No files found.
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
View file @
31a440b9
...
...
@@ -19,8 +19,8 @@ template <typename GridwiseGemm,
typename
BGridDesc_E0_E1_N_Ho_Wo_E2
,
typename
CGridDesc_K_N_Ho_Wo
,
typename
CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo
,
bool
HasMain
K
BlockLoop
,
bool
HasDoubleTail
K
BlockLoop
>
bool
HasMain
E1
BlockLoop
,
bool
HasDoubleTail
E1
BlockLoop
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
...
...
@@ -46,8 +46,8 @@ __global__ void
a_e0_e1_k_e2_grid_desc
,
b_e0_e1_n_ho_wo_e2_grid_desc
,
c_k_n_ho_wo_grid_desc
,
integral_constant
<
bool
,
HasMain
K
BlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTail
K
BlockLoop
>
{});
integral_constant
<
bool
,
HasMain
E1
BlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTail
E1
BlockLoop
>
{});
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by CONSTANT void pointer
...
...
@@ -60,8 +60,8 @@ template <typename GridwiseGemm,
typename
BGridDesc_E0_E1_N_Ho_Wo_E2
,
typename
CGridDesc_K_N_Ho_Wo
,
typename
CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo
,
bool
HasMain
K
BlockLoop
,
bool
HasDoubleTail
K
BlockLoop
>
bool
HasMain
E1
BlockLoop
,
bool
HasDoubleTail
E1
BlockLoop
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
...
...
@@ -96,8 +96,8 @@ __global__ void
a_e0_e1_k_e2_grid_desc
,
b_e0_e1_n_ho_wo_e2_grid_desc
,
c_k_n_ho_wo_grid_desc
,
integral_constant
<
bool
,
HasMain
K
BlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTail
K
BlockLoop
>
{});
integral_constant
<
bool
,
HasMain
E1
BlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTail
E1
BlockLoop
>
{});
}
#endif
...
...
@@ -109,8 +109,8 @@ template <index_t BlockSize,
typename
AGlobalDesc_E0_E1_K_E2
,
typename
BGlobalDesc_E0_E1_N_Ho_Wo_E2
,
typename
CGlobalDesc_K_N_Ho_Wo
,
index_t
E1
,
index_t
E2
,
index_t
E1
_
,
index_t
E2
_
,
index_t
KPerBlock
,
index_t
HoPerBlock
,
index_t
WoPerBlock
,
...
...
@@ -148,6 +148,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3
static
constexpr
auto
I4
=
Number
<
4
>
{};
static
constexpr
auto
I5
=
Number
<
5
>
{};
static
constexpr
auto
E1
=
Number
<
E1_
>
{};
static
constexpr
auto
E2
=
Number
<
E2_
>
{};
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
{
constexpr
auto
max_lds_align
=
Number
<
ABlockTransferDstScalarPerVector_E2
>
{};
...
...
@@ -164,7 +167,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
return
a_block_space_size
*
sizeof
(
FloatAB
);
}
template
<
bool
HasMain
K
BlockLoop
,
bool
HasDoubleTail
K
BlockLoop
>
template
<
bool
HasMain
E1
BlockLoop
,
bool
HasDoubleTail
E1
BlockLoop
>
__device__
static
void
Run
(
const
FloatAB
*
__restrict__
p_a_global
,
const
FloatAB
*
__restrict__
p_b_global
,
FloatC
*
__restrict__
p_c_global
,
...
...
@@ -172,8 +175,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
const
AGlobalDesc_E0_E1_K_E2
&
a_e0_e1_k_e2_global_desc
,
const
BGlobalDesc_E0_E1_N_Ho_Wo_E2
&
b_e0_e1_n_ho_wo_e2_global_desc
,
const
CGlobalDesc_K_N_Ho_Wo
&
c_k_n_ho_wo_global_desc
,
integral_constant
<
bool
,
HasMain
K
BlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTail
K
BlockLoop
>
)
integral_constant
<
bool
,
HasMain
E1
BlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTail
E1
BlockLoop
>
)
{
const
auto
a_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
p_a_global
,
a_e0_e1_k_e2_global_desc
.
GetElementSpaceSize
());
...
...
@@ -192,7 +195,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
const
auto
Wo
=
b_e0_e1_n_ho_wo_e2_global_desc
.
GetLength
(
I4
);
// divide block work by [M, N]
#if
1
#if
0
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
...
...
@@ -356,10 +359,14 @@ struct GridwiseGemmDlops_km_kn_mn_v3
const
auto
E0
=
b_e0_e1_n_ho_wo_e2_global_desc
.
GetLength
(
I0
);
constexpr
auto
HasMainE0BlockLoop
=
false
;
if
constexpr
(
HasMainE0BlockLoop
)
{
index_t
e0_block_data_begin
=
0
;
//
do
//
{
do
{
// LDS double buffer: preload data
{
a_blockwise_copy
.
RunRead
(
...
...
@@ -377,7 +384,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
__syncthreads
();
if
constexpr
(
HasMain
K
BlockLoop
)
if
constexpr
(
HasMain
E1
BlockLoop
)
{
index_t
e1_block_data_begin
=
0
;
...
...
@@ -422,7 +429,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
}
// LDS double buffer: tail
if
constexpr
(
HasDoubleTail
K
BlockLoop
)
// if has 2 iteration left
if
constexpr
(
HasDoubleTail
E1
BlockLoop
)
// if has 2 iteration left
{
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
...
...
@@ -448,18 +455,109 @@ struct GridwiseGemmDlops_km_kn_mn_v3
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
}
// a_blockwise_copy.MoveSrcSliceWindow(a_e0_e1_k_e2_global_desc,
// a_block_slice_copy_step,
// AGlobalMoveSliceWindowStepHacks{});
a_blockwise_copy
.
MoveSrcSliceWindow
(
a_e0_e1_k_e2_global_desc
,
a_block_slice_copy_step
,
AGlobalMoveSliceWindowStepHacks
{});
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
-
(
E1
-
EPerBlock
),
0
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
e0_block_data_begin
+=
1
;
}
while
(
e0_block_data_begin
<
E0
);
}
else
{
// LDS double buffer: preload data
{
a_blockwise_copy
.
RunRead
(
a_e0_e1_k_e2_global_desc
,
a_global_buf
,
a_e0_e1_k_e2_global_step_hacks
);
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
b_e0_e1_n_ho_wo_e2_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_even_buf
,
b_e0_e1_n_ho_wo_e2_global_step_hacks
);
a_blockwise_copy
.
RunWrite
(
a_e0_e1_k_e2_block_desc
,
a_block_buf
);
}
__syncthreads
();
if
constexpr
(
HasMainE1BlockLoop
)
{
index_t
e1_block_data_begin
=
0
;
// blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - EPerBlock), 0, 0));
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
// b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_ho_wo_e2_global_desc,
// b_thread_slice_copy_step);
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
b_e0_e1_n_ho_wo_e2_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_odd_buf
,
b_e0_e1_n_ho_wo_e2_global_step_hacks
);
// e0_block_data_begin += 1;
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
//} while(e0_block_data_begin < E0);
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
b_e0_e1_n_ho_wo_e2_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_even_buf
,
b_e0_e1_n_ho_wo_e2_global_step_hacks
);
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
e1_block_data_begin
+=
2
*
EPerBlock
;
}
while
(
e1_block_data_begin
<
E1
-
2
*
EPerBlock
);
}
// LDS double buffer: tail
if
constexpr
(
HasDoubleTailE1BlockLoop
)
// if has 2 iteration left
{
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e0_e1_n_ho_wo_e2_global_desc
,
b_global_buf
,
b_e0_e1_n_ho_wo_e2_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
,
I0
,
I0
),
b_thread_odd_buf
,
b_e0_e1_n_ho_wo_e2_global_step_hacks
);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveABlockSliceWindow
(
make_tuple
(
EPerBlock
,
0
,
0
));
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
}
else
// if has 1 iteration left
{
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
}
}
// output: register to global memory
{
...
...
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
View file @
31a440b9
...
...
@@ -93,17 +93,17 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk(
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
E1
=
2
*
9
;
constexpr
index_t
E2
=
8
;
constexpr
index_t
EPerBlock
=
2
;
constexpr
index_t
E1
=
4
*
9
;
constexpr
index_t
E2
=
4
;
constexpr
index_t
EPerBlock
=
4
;
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
EPerThread
=
1
;
constexpr
index_t
EPerThread
=
EPerBlock
;
using
ABlockTransferThreadSliceLengths_E0_E1_K_E2
=
Sequence
<
1
,
9
,
1
,
E2
>
;
using
ABlockTransferThreadClusterLengths_E0_E1_K_E2
=
Sequence
<
1
,
2
,
16
,
1
>
;
using
ABlockTransferThreadClusterLengths_E0_E1_K_E2
=
Sequence
<
1
,
EPerBlock
,
16
,
1
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_E2
=
E2
;
constexpr
index_t
ABlockTransferDstScalarPerVector_E2
=
E2
;
...
...
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nhwc_kyxc_nhwk.hpp
View file @
31a440b9
...
...
@@ -95,7 +95,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
const
auto
C0
=
C
/
E2
;
const
auto
E
=
Y
*
X
*
C0
;
const
auto
E0
=
E
/
E1
;
// weight tensor
...
...
@@ -198,13 +197,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
b_e0_e1_n_ho_wo_e2_global_move_slice_window_step_hack
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{};
...
...
@@ -243,18 +242,18 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
EPerThread
,
ABlockTransferThreadSliceLengths_E0_E1_K_E2
,
ABlockTransferThreadClusterLengths_E0_E1_K_E2
,
Sequence
<
2
,
0
,
1
,
3
>
,
Sequence
<
2
,
0
,
1
,
3
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
ABlockTransferSrcScalarPerVector_E2
,
ABlockTransferDstScalarPerVector_E2
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
0
,
2
,
3
,
4
,
1
,
5
>
,
Sequence
<
2
,
0
,
1
,
3
,
4
,
5
>
,
5
,
BThreadTransferSrcScalarPerVector_E2
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence
<
2
,
3
,
1
,
0
>
,
Sequence
<
1
,
2
,
3
,
0
>
,
0
,
CThreadTransferDstScalarPerVector_K
,
decltype
(
a_e0_e1_k_e2_global_step_hacks
),
...
...
@@ -269,17 +268,17 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
const
auto
grid_size
=
(
K
/
KPerBlock
)
*
(
Hop
/
HoPerBlock
)
*
(
Wop
/
WoPerBlock
)
*
N
;
const
bool
has_main_k_block_loop
=
(
E1
+
E1PerBlock
)
/
(
2
*
E1PerBlock
)
>
1
;
const
expr
bool
has_main_k_block_loop
=
(
E1
+
E1PerBlock
)
/
(
2
*
E1PerBlock
)
>
1
;
const
bool
has_double_tail_k_block_loop
=
(
E1
/
E1PerBlock
)
%
2
==
0
;
const
expr
bool
has_double_tail_k_block_loop
=
(
E1
/
E1PerBlock
)
%
2
==
0
;
std
::
cerr
<<
"has_main_k_block_loop = "
<<
has_main_k_block_loop
<<
" has_double_tail_k_block_loop = "
<<
has_double_tail_k_block_loop
<<
std
::
endl
;
const
auto
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_
merge_transform
(
make_tuple
(
I0
,
I0
)
)),
make_tuple
(
Sequence
<
0
,
1
>
{}),
make_single_stage_tensor_adaptor
(
make_tuple
(
make_
pass_through_transform
(
I0
)),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
using
CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo
=
...
...
@@ -288,7 +287,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
float
ave_time
=
0
;
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
if
constexpr
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v2
<
GridwiseGemm
,
...
...
@@ -314,7 +313,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
c_k_n_hop_wop_grid_desc
,
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
);
}
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
else
if
constexpr
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v2
<
GridwiseGemm
,
...
...
@@ -340,7 +339,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
c_k_n_hop_wop_grid_desc
,
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
);
}
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
else
if
constexpr
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v2
<
GridwiseGemm
,
...
...
@@ -409,7 +408,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
float
ave_time
=
0
;
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
if
constexpr
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v2
<
GridwiseGemm
,
...
...
@@ -440,7 +439,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
cast_pointer_to_constant_address_space
(
c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
}
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
else
if
constexpr
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v2
<
GridwiseGemm
,
...
...
@@ -471,7 +470,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nhwc_kyxc_nhwk_outp
cast_pointer_to_constant_address_space
(
c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
}
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
else
if
constexpr
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v2
<
GridwiseGemm
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment