Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
78740e43
Commit
78740e43
authored
Jul 22, 2021
by
Jing Zhang
Browse files
make static v5r1
parent
12649254
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1069 additions
and
39 deletions
+1069
-39
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
...tion_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+383
-0
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
...rnel/include/tensor_operation/gridwise_static_gemm_v2.hpp
+453
-0
composable_kernel/include/utility/config.hpp
composable_kernel/include/utility/config.hpp
+3
-3
host/driver_offline/conv_fwd_driver_offline.cpp
host/driver_offline/conv_fwd_driver_offline.cpp
+26
-26
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+5
-5
host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+192
-0
script/cmake-rocm.sh
script/cmake-rocm.sh
+2
-2
script/run.sh
script/run.sh
+5
-3
No files found.
composable_kernel/include/driver/driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
0 → 100644
View file @
78740e43
#ifndef CK_DRIVER_STATIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_OUTPAD_HPP
#define CK_DRIVER_STATIC_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_OUTPAD_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "gridwise_static_gemm_v2.hpp"
#include "gridwise_operation_wrapper.hpp"
namespace
ck
{
template
<
index_t
BlockSize
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatC
,
index_t
KPerBlock
,
index_t
HoPerBlock
,
index_t
WoPerBlock
,
index_t
EPerBlock
,
index_t
KPerThread
,
index_t
HoPerThread
,
index_t
WoPerThread
,
index_t
EPerThread
,
typename
ABlockTransferThreadSliceLengths_E_K
,
typename
ABlockTransferThreadClusterLengths_E_K
,
index_t
ABlockTransferSrcScalarPerVector_E
,
index_t
ABlockTransferDstScalarPerVector_K
,
index_t
BThreadTransferSrcScalarPerVector_W
,
index_t
CThreadTransferDstScalarPerVector_W
>
struct
DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
{
template
<
typename
...
Wei
,
typename
...
In
,
typename
...
Out
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
__host__
void
Run
(
const
DynamicTensorDescriptor
<
Wei
...
>&
wei_k_c_y_x_global_desc
,
const
DynamicTensorDescriptor
<
In
...
>&
in_n_c_hi_wi_global_desc
,
const
DynamicTensorDescriptor
<
Out
...
>&
out_n_k0_ho_wo_global_desc
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads_
,
const
FloatAB
*
__restrict__
p_wei_global
,
const
FloatAB
*
__restrict__
p_in_global
,
FloatC
*
__restrict__
p_out_global
)
const
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I4
=
Number
<
4
>
{};
const
auto
N_
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I0
);
const
auto
C_
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I1
);
const
auto
K0_
=
out_n_k0_ho_wo_global_desc
.
GetLength
(
I1
);
const
auto
Hi_
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I2
);
const
auto
Wi_
=
in_n_c_hi_wi_global_desc
.
GetLength
(
I3
);
const
auto
Ho_
=
out_n_k0_ho_wo_global_desc
.
GetLength
(
I2
);
const
auto
Wo_
=
out_n_k0_ho_wo_global_desc
.
GetLength
(
I3
);
const
auto
K_
=
wei_k_c_y_x_global_desc
.
GetLength
(
I0
);
const
auto
Y_
=
wei_k_c_y_x_global_desc
.
GetLength
(
I2
);
const
auto
X_
=
wei_k_c_y_x_global_desc
.
GetLength
(
I3
);
constexpr
auto
N
=
Number
<
N_
>
{};
constexpr
auto
C
=
Number
<
C_
>
{};
constexpr
auto
K0
=
Number
<
K0_
>
{};
constexpr
auto
Hi
=
Number
<
Hi_
>
{};
constexpr
auto
Wi
=
Number
<
Wi_
>
{};
constexpr
auto
Ho
=
Number
<
Ho_
>
{};
constexpr
auto
Wo
=
Number
<
Wo_
>
{};
constexpr
auto
K
=
Number
<
K_
>
{};
constexpr
auto
Y
=
Number
<
Y_
>
{};
constexpr
auto
X
=
Number
<
X_
>
{};
const
auto
ConvStrideH_
=
conv_strides
[
I0
];
const
auto
ConvStrideW_
=
conv_strides
[
I1
];
const
auto
ConvDilationH_
=
conv_dilations
[
I0
];
const
auto
ConvDilationW_
=
conv_dilations
[
I1
];
constexpr
auto
ConvStrideH
=
Number
<
ConvStrideH_
>
{};
constexpr
auto
ConvStrideW
=
Number
<
ConvStrideW_
>
{};
constexpr
auto
ConvDilationH
=
Number
<
ConvDilationH_
>
{};
constexpr
auto
ConvDilationW
=
Number
<
ConvDilationW_
>
{};
constexpr
auto
Hop
=
Number
<
(
Ho
+
HoPerBlock
-
1
)
/
HoPerBlock
*
HoPerBlock
>
{};
constexpr
auto
Wop
=
Number
<
(
Wo
+
WoPerBlock
-
1
)
/
WoPerBlock
*
WoPerBlock
>
{};
constexpr
auto
OutRightPadH
=
Hop
-
Ho
;
constexpr
auto
OutRightPadW
=
Wop
-
Wo
;
const
auto
InLeftPadH_
=
in_left_pads
[
I0
];
const
auto
InLeftPadW_
=
in_left_pads
[
I1
];
constexpr
auto
InLeftPadH
=
Number
<
InLeftPadH_
>
{};
constexpr
auto
InLeftPadW
=
Number
<
InLeftPadW_
>
{};
constexpr
auto
in_right_pads
=
InRightPads
{};
const
auto
InRightPadH_
=
in_right_pads
[
I0
]
+
OutRightPadH
*
ConvStrideH
;
const
auto
InRightPadW_
=
in_right_pads
[
I1
]
+
OutRightPadW
*
ConvStrideW
;
constexpr
auto
InRightPadH
=
Number
<
InRightPadH_
>
{};
constexpr
auto
InRightPadW
=
Number
<
InRightPadW_
>
{};
std
::
cerr
<<
"OutRightPadH = "
<<
OutRightPadH
<<
" OutRightPadW = "
<<
OutRightPadW
<<
std
::
endl
;
std
::
cerr
<<
"InRightPadH = "
<<
InRightPadH
<<
" InRightPadW = "
<<
InRightPadW
<<
std
::
endl
;
// weight tensor
const
auto
wei_e_k_global_desc
=
transform_dynamic_tensor_descriptor
(
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
K
,
C
*
Y
*
X
)),
make_tuple
(
make_pass_through_transform
(
K
),
make_pass_through_transform
(
C
*
Y
*
X
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}));
static_assert
(
wei_e_k_global_desc
.
IsKnownAtCompileTime
(),
"wrong! wei_e_k_global_desc need to known at compile-time"
);
// input tensor
const
auto
in_n_c_hip_wip_global_desc
=
transform_dynamic_tensor_descriptor
(
in_n_c_hi_wi_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
static_assert
(
in_n_c_hip_wip_global_desc
.
IsKnownAtCompileTime
(),
"wrong! in_n_c_hip_wip_global_desc need to known at compile-time"
);
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_dynamic_tensor_descriptor
(
in_n_c_hip_wip_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C
),
make_embed_transform
(
make_tuple
(
Y
,
Hop
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
X
,
Wop
),
make_tuple
(
ConvDilationW
,
ConvStrideW
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
static_assert
(
in_n_c_y_ho_x_wo_global_desc
.
IsKnownAtCompileTime
(),
"wrong! in_n_c_y_ho_x_wo_global_desc need to known at compile-time"
);
const
auto
in_e_n_ho_wo_global_desc
=
transform_dynamic_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
C
,
Y
,
X
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
Hop
),
make_pass_through_transform
(
Wop
)),
make_tuple
(
Sequence
<
1
,
2
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
3
>
{},
Sequence
<
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
static_assert
(
in_e_n_ho_wo_global_desc
.
IsKnownAtCompileTime
(),
"wrong! in_e_n_ho_wo_global_desc need to known at compile-time"
);
// output tensor
const
auto
out_k_n_hop_wop_global_desc
=
transform_dynamic_tensor_descriptor
(
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
)),
make_tuple
(
make_pass_through_transform
(
K0
),
make_pass_through_transform
(
N
),
make_pad_transform
(
Ho
,
I0
,
OutRightPadH
),
make_pad_transform
(
Wo
,
I0
,
OutRightPadW
)),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
static_assert
(
out_k_n_hop_wop_global_desc
.
IsKnownAtCompileTime
(),
"wrong! out_k_n_hop_wop_global_desc need to known at compile-time"
);
const
auto
E
=
C
*
Y
*
X
;
std
::
cerr
<<
"Hop = "
<<
Hop
<<
" Wop = "
<<
Wop
<<
std
::
endl
;
if
(
!
((
K
%
KPerBlock
)
==
0
&&
(
Hop
%
HoPerBlock
)
==
0
&&
(
Wop
%
WoPerBlock
)
==
0
&&
(
E
%
EPerBlock
)
==
0
))
{
throw
std
::
runtime_error
(
"wrong! GEMM size no divisible"
);
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr
auto
a_e_k_global_iterator_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
>
{}));
constexpr
auto
a_e_k_global_move_slice_window_iterator_hack
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
b_e_n_ho_wo_global_iterator_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
b_e_n_ho_wo_global_move_slice_window_iterator_hack
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
// GEMM
using
gridwise_gemm
=
GridwiseStaticGemm_km_kn_mn_v3
<
BlockSize
,
FloatAB
,
FloatAcc
,
FloatC
,
InMemoryDataOperation
::
Set
,
decltype
(
wei_e_k_global_desc
),
decltype
(
in_e_n_ho_wo_global_desc
),
decltype
(
out_k_n_hop_wop_global_desc
),
KPerBlock
,
HoPerBlock
,
WoPerBlock
,
EPerBlock
,
KPerThread
,
HoPerThread
,
WoPerThread
,
EPerThread
,
ABlockTransferThreadSliceLengths_E_K
,
ABlockTransferThreadClusterLengths_E_K
,
Sequence
<
1
,
0
>
,
Sequence
<
1
,
0
>
,
0
,
ABlockTransferSrcScalarPerVector_E
,
ABlockTransferDstScalarPerVector_K
,
false
,
// don't move back src coordinate after threadwise copy
Sequence
<
0
,
2
,
3
,
1
>
,
3
,
BThreadTransferSrcScalarPerVector_W
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused with
// MoveSrcSliceWindow() to save addr computation
Sequence
<
0
,
2
,
3
,
1
>
,
0
,
CThreadTransferDstScalarPerVector_W
,
decltype
(
a_e_k_global_iterator_hacks
),
decltype
(
b_e_n_ho_wo_global_iterator_hacks
),
decltype
(
c_k_n_ho_wo_global_tensor_iterator_hacks
),
decltype
(
a_e_k_global_move_slice_window_iterator_hack
),
decltype
(
b_e_n_ho_wo_global_move_slice_window_iterator_hack
)
>
;
const
auto
GridSize
=
(
K
/
KPerBlock
)
*
(
Hop
/
HoPerBlock
)
*
(
Wop
/
WoPerBlock
)
*
N
;
constexpr
bool
has_main_k_block_loop
=
(
E
+
EPerBlock
)
/
(
2
*
EPerBlock
)
>
1
;
constexpr
bool
has_double_tail_k_block_loop
=
(
E
/
EPerBlock
)
%
2
==
0
;
index_t
nrepeat
=
100
;
#if 1
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
std
::
cout
<<
"Start running "
<<
nrepeat
<<
" times..."
<<
std
::
endl
;
KernelTimer
timer
;
timer
.
Start
();
std
::
cout
<<
"has_main_k_block_loop: "
<<
has_main_k_block_loop
<<
" has_double_tail_k_block_loop: "
<<
has_double_tail_k_block_loop
<<
std
::
endl
;
for
(
index_t
j
=
0
;
j
<
nrepeat
;
++
j
)
{
if
constexpr
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
const
FloatAB
*
,
const
FloatAB
*
,
FloatC
*
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
true
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
0
,
p_wei_global
,
p_in_global
,
p_out_global
,
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
true
>
{});
}
else
if
constexpr
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
const
FloatAB
*
,
const
FloatAB
*
,
FloatC
*
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
false
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
0
,
p_wei_global
,
p_in_global
,
p_out_global
,
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
false
>
{});
}
else
if
constexpr
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
const
FloatAB
*
,
const
FloatAB
*
,
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
true
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
0
,
p_wei_global
,
p_in_global
,
p_out_global
,
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
true
>
{});
}
else
{
const
auto
kernel
=
run_gridwise_operation
<
gridwise_gemm
,
const
FloatAB
*
,
const
FloatAB
*
,
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
false
>>
;
launch_kernel
(
kernel
,
dim3
(
GridSize
),
dim3
(
BlockSize
),
0
,
0
,
p_wei_global
,
p_in_global
,
p_out_global
,
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
false
>
{});
}
}
timer
.
End
();
float
ave_time
=
timer
.
GetElapsedTime
()
/
nrepeat
;
float
perf
=
(
float
)
calculate_convolution_flops
(
in_n_c_hi_wi_global_desc
,
wei_k_c_y_x_global_desc
,
out_n_k0_ho_wo_global_desc
)
/
(
std
::
size_t
(
1000
)
*
1000
*
1000
)
/
ave_time
;
std
::
cout
<<
"Average time : "
<<
ave_time
<<
" ms, "
<<
perf
<<
" TFlop/s"
<<
std
::
endl
;
}
#endif
}
};
}
// namespace ck
#endif
composable_kernel/include/tensor_operation/gridwise_static_gemm_v2.hpp
0 → 100644
View file @
78740e43
#ifndef CK_GRIDWISE_STATIC_GEMM_V2_HPP
#define CK_GRIDWISE_STATIC_GEMM_V2_HPP
#include "common_header.hpp"
#include "dynamic_multi_index_transform_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"
#include "blockwise_gemm_v3.hpp"
namespace
ck
{
template
<
index_t
BlockSize
,
typename
FloatAB
,
typename
FloatAcc
,
typename
FloatC
,
InMemoryDataOperation
CGlobalMemoryDataOperation
,
typename
AGlobalDesc
,
typename
BGlobalDesc
,
typename
CGlobalDesc
,
index_t
KPerBlock
,
index_t
HoPerBlock
,
index_t
WoPerBlock
,
index_t
EPerBlock
,
index_t
KPerThread
,
index_t
HoPerThread
,
index_t
WoPerThread
,
index_t
EPerThread
,
typename
ABlockTransferThreadSliceLengths_E_K
,
typename
ABlockTransferThreadClusterLengths_E_K
,
typename
ABlockTransferThreadClusterArrangeOrder
,
typename
ABlockTransferSrcAccessOrder
,
index_t
ABlockTransferSrcVectorDim
,
index_t
ABlockTransferSrcScalarPerVector
,
index_t
ABlockTransferDstScalarPerVector_K
,
bool
AThreadTransferSrcResetCoordinateAfterRun
,
typename
BBlockTransferSrcAccessOrder
,
index_t
BBlockTransferSrcVectorDim
,
index_t
BBlockTransferSrcScalarPerVector
,
bool
BThreadTransferSrcResetCoordinateAfterRun
,
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
typename
AGlobalIteratorHacks
,
typename
BGlobalIteratorHacks
,
typename
CGlobalIteratorHacks
,
typename
AGlobalMoveSliceWindowIteratorHacks
,
typename
BGlobalMoveSliceWindowIteratorHacks
>
struct
GridwiseStaticGemm_km_kn_mn_v3
{
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
{
constexpr
auto
E
=
EPerBlock
*
3
*
3
;
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_K
>
{},
Number
<
KPerBlock
>
{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
a_e_k_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
// LDS allocation for A and B: be careful of alignment
constexpr
auto
a_block_space_size
=
math
::
integer_least_multiple
(
a_e_k_desc
.
GetElementSpaceSize
(),
max_lds_align
);
return
a_block_space_size
*
sizeof
(
FloatAB
);
}
template
<
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
__device__
void
Run
(
const
FloatAB
*
__restrict__
p_a_global
,
const
FloatAB
*
__restrict__
p_b_global
,
FloatC
*
__restrict__
p_c_global
,
FloatAB
*
__restrict__
p_shared_block
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
)
const
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
a_e_k_global_desc
=
AGlobalDesc
{};
constexpr
auto
b_e_n_ho_wo_global_desc
=
BGlobalDesc
{};
constexpr
auto
c_k_n_ho_wo_global_desc
=
CGlobalDesc
{};
const
auto
a_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_a_global
,
a_e_k_global_desc
.
GetElementSpaceSize
());
const
auto
b_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_b_global
,
b_e_n_ho_wo_global_desc
.
GetElementSpaceSize
());
auto
c_global_buf
=
make_dynamic_buffer
<
AddressSpace
::
Global
>
(
p_c_global
,
c_k_n_ho_wo_global_desc
.
GetElementSpaceSize
());
constexpr
auto
E
=
a_e_k_global_desc
.
GetLength
(
I0
);
constexpr
auto
K
=
a_e_k_global_desc
.
GetLength
(
I1
);
constexpr
auto
N
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I1
);
constexpr
auto
Ho
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I2
);
constexpr
auto
Wo
=
b_e_n_ho_wo_global_desc
.
GetLength
(
I3
);
// divide block work by [M, N]
#if 0
const auto k_block_work_num = K / Number<KPerBlock>{};
const auto ho_block_work_num = Ho / Number<HoPerBlock>{};
const auto wo_block_work_num = Wo / Number<WoPerBlock>{};
const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
const index_t k_block_work_id = get_block_1d_id() / hwo_block_work_num;
const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
#else
// Hack: this force result into SGPR
const
index_t
k_block_work_num
=
__builtin_amdgcn_readfirstlane
(
K
/
KPerBlock
);
const
index_t
ho_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Ho
/
HoPerBlock
);
const
index_t
wo_block_work_num
=
__builtin_amdgcn_readfirstlane
(
Wo
/
WoPerBlock
);
const
index_t
hwo_block_work_num
=
ho_block_work_num
*
wo_block_work_num
;
const
index_t
k_block_work_id
=
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
/
hwo_block_work_num
);
const
index_t
hwo_block_work_id
=
get_block_1d_id
()
-
k_block_work_id
*
hwo_block_work_num
;
const
index_t
ho_block_work_id
=
__builtin_amdgcn_readfirstlane
(
hwo_block_work_id
/
wo_block_work_num
);
const
index_t
wo_block_work_id
=
hwo_block_work_id
-
ho_block_work_id
*
wo_block_work_num
;
#endif
// lds max alignment
constexpr
auto
max_lds_align
=
math
::
lcm
(
Number
<
ABlockTransferDstScalarPerVector_K
>
{},
Number
<
KPerBlock
>
{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
a_e_k_block_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
constexpr
auto
a_e_k_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
make_tuple
(
Number
<
E
>
{},
Number
<
KPerBlock
>
{}),
max_lds_align
);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr
auto
b_e_n_ho_wo_block_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
1
>
{},
Number
<
HoPerBlock
>
{},
Number
<
WoPerBlock
>
{}));
// c_thread_mtx definition: this is a mess
// TODO:: more elegent way of defining c_thread_mtx
constexpr
auto
c_k_n_ho_wo_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
KPerThread
>
{},
Number
<
1
>
{},
Number
<
HoPerThread
>
{},
Number
<
WoPerThread
>
{}));
auto
blockwise_gemm
=
BlockwiseGemm_km_kn_m0m1n0n1_v3
<
BlockSize
,
FloatAB
,
FloatAB
,
FloatAcc
,
decltype
(
a_e_k_block_desc
),
decltype
(
b_e_n_ho_wo_block_desc
),
decltype
(
c_k_n_ho_wo_thread_desc
),
KPerThread
,
HoPerThread
,
WoPerThread
,
EPerThread
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K
>
{};
auto
c_thread_mtx_index
=
blockwise_gemm
.
GetBeginOfThreadMatrixC
(
get_thread_local_1d_id
());
const
auto
k_thread_id
=
c_thread_mtx_index
.
k
;
const
auto
ho_thread_id
=
c_thread_mtx_index
.
h
;
const
auto
wo_thread_id
=
c_thread_mtx_index
.
w
;
const
index_t
k_block_data_on_global
=
k_block_work_id
*
KPerBlock
;
const
index_t
ho_block_data_on_global
=
ho_block_work_id
*
HoPerBlock
;
const
index_t
wo_block_data_on_global
=
wo_block_work_id
*
WoPerBlock
;
const
index_t
ho_thread_data_on_global
=
ho_block_data_on_global
+
ho_thread_id
*
HoPerThread
;
const
index_t
wo_thread_data_on_global
=
wo_block_data_on_global
+
wo_thread_id
*
WoPerThread
;
// A matrix blockwise copy
auto
a_blockwise_copy
=
BlockwiseDynamicTensorSliceTransfer_v4
<
BlockSize
,
InMemoryDataOperation
::
Set
,
Sequence
<
E
,
KPerBlock
>
,
ABlockTransferThreadSliceLengths_E_K
,
ABlockTransferThreadClusterLengths_E_K
,
ABlockTransferThreadClusterArrangeOrder
,
FloatAB
,
FloatAB
,
decltype
(
a_e_k_global_desc
),
decltype
(
a_e_k_desc
),
ABlockTransferSrcAccessOrder
,
Sequence
<
0
,
1
>
,
ABlockTransferSrcVectorDim
,
1
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K
,
1
,
1
,
AThreadTransferSrcResetCoordinateAfterRun
,
true
>
(
a_e_k_global_desc
,
make_multi_index
(
0
,
k_block_data_on_global
),
a_e_k_desc
,
make_multi_index
(
0
,
0
));
constexpr
auto
b_e_n_ho_wo_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
EPerBlock
>
{},
Number
<
1
>
{},
Number
<
HoPerThread
>
{},
Number
<
WoPerThread
>
{}));
auto
b_threadwise_transfer
=
ThreadwiseDynamicTensorSliceTransfer_v2
<
FloatAB
,
FloatAB
,
decltype
(
b_e_n_ho_wo_global_desc
),
decltype
(
b_e_n_ho_wo_thread_desc
),
Sequence
<
EPerBlock
,
1
,
HoPerThread
,
WoPerThread
>
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcScalarPerVector
,
1
,
true
>
(
b_e_n_ho_wo_global_desc
,
make_multi_index
(
0
,
0
,
ho_thread_data_on_global
,
wo_thread_data_on_global
));
auto
a_block_buf
=
make_dynamic_buffer
<
AddressSpace
::
Lds
>
(
p_shared_block
,
a_e_k_desc
.
GetElementSpaceSize
());
// register allocation for output
StaticBuffer
<
AddressSpace
::
Vgpr
,
FloatAcc
,
c_k_n_ho_wo_thread_desc
.
GetElementSpaceSize
()
>
c_thread_buf
;
// initialize output thread tensor
ThreadwiseDynamicTensorSliceSet_v1
<
FloatAcc
,
decltype
(
c_k_n_ho_wo_thread_desc
),
Sequence
<
KPerThread
,
1
,
HoPerThread
,
WoPerThread
>>
{}
.
Run
(
c_k_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
c_thread_buf
,
FloatAcc
{
0
});
constexpr
auto
b_thread_slice_copy_step
=
make_multi_index
(
EPerBlock
,
0
,
0
,
0
);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr
auto
a_e_k_global_iterator_hacks
=
AGlobalIteratorHacks
{};
constexpr
auto
b_e_n_ho_wo_global_iterator_hacks
=
BGlobalIteratorHacks
{};
// hack to control index calculation when move slice window for A and B matrix for
// threadwise copy
constexpr
auto
a_e_k_global_move_slice_window_iterator_hack
=
AGlobalMoveSliceWindowIteratorHacks
{};
constexpr
auto
b_e_n_ho_wo_global_move_slice_window_iterator_hack
=
BGlobalMoveSliceWindowIteratorHacks
{};
// double regsiter buffer for b
StaticBuffer
<
AddressSpace
::
Vgpr
,
FloatAB
,
b_e_n_ho_wo_thread_desc
.
GetElementSpaceSize
()
>
b_thread_even_buf
,
b_thread_odd_buf
;
// LDS double buffer: preload data
{
a_blockwise_copy
.
RunRead
(
a_e_k_global_desc
,
a_global_buf
,
a_e_k_global_iterator_hacks
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_even_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
a_blockwise_copy
.
RunWrite
(
a_e_k_desc
,
a_block_buf
);
}
__syncthreads
();
if
constexpr
(
HasMainKBlockLoop
)
{
index_t
e_block_data_begin
=
0
;
// LDS double buffer: main body
// use Do-While loop instead of For loop to simplify control flow
do
{
// even iteration
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e_n_ho_wo_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_odd_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
// LDS double buffer: GEMM on current data
// TODO: @Zhang Jing: blockwise gemm should be able to move slice window
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveASliceWindow
(
a_e_k_block_desc
,
make_tuple
(
EPerBlock
,
0
));
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e_n_ho_wo_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_even_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
// LDS double buffer: GEMM on current data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveASliceWindow
(
a_e_k_block_desc
,
make_tuple
(
EPerBlock
,
0
));
e_block_data_begin
+=
2
*
EPerBlock
;
}
while
(
e_block_data_begin
<
E
-
2
*
EPerBlock
);
}
// LDS double buffer: tail
if
constexpr
(
HasDoubleTailKBlockLoop
)
// if has 2 iteration left
{
b_threadwise_transfer
.
MoveSrcSliceWindow
(
b_e_n_ho_wo_global_desc
,
b_thread_slice_copy_step
);
b_threadwise_transfer
.
Run
(
b_e_n_ho_wo_global_desc
,
b_global_buf
,
b_e_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_odd_buf
,
b_e_n_ho_wo_global_iterator_hacks
);
// LDS double buffer: GEMM on 2nd-last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
blockwise_gemm
.
MoveASliceWindow
(
a_e_k_block_desc
,
make_tuple
(
EPerBlock
,
0
));
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_odd_buf
,
c_thread_buf
);
}
else
// if has 1 iteration left
{
// LDS double buffer: GEMM on last data
blockwise_gemm
.
Run
(
a_block_buf
,
b_thread_even_buf
,
c_thread_buf
);
}
// output: register to global memory
{
// hack to control index calculation when iterating over c_k_n_ho_wo_global tensor
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
CGlobalIteratorHacks
{};
const
index_t
k_thread_data_on_global
=
k_block_data_on_global
+
k_thread_id
*
KPerThread
;
ThreadwiseDynamicTensorSliceTransfer_v1r3
<
FloatAcc
,
FloatC
,
decltype
(
c_k_n_ho_wo_thread_desc
),
decltype
(
c_k_n_ho_wo_global_desc
),
Sequence
<
KPerThread
,
1
,
HoPerThread
,
WoPerThread
>
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
CGlobalMemoryDataOperation
,
1
,
true
>
(
c_k_n_ho_wo_global_desc
,
make_multi_index
(
k_thread_data_on_global
,
0
,
ho_thread_data_on_global
,
wo_thread_data_on_global
))
.
Run
(
c_k_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
c_thread_buf
,
c_k_n_ho_wo_global_desc
,
c_global_buf
,
c_k_n_ho_wo_global_tensor_iterator_hacks
);
}
}
// pass tensor descriptor by reference
template
<
bool
HasMainKBlockLoop
,
bool
HasDoubleTailKBlockLoop
>
__device__
void
Run
(
const
FloatAB
*
__restrict__
p_a_global
,
const
FloatAB
*
__restrict__
p_b_global
,
FloatC
*
__restrict__
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
)
const
{
constexpr
index_t
shared_block_size
=
GetSharedMemoryNumberOfByte
()
/
sizeof
(
FloatAB
);
__shared__
FloatAB
p_shared_block
[
shared_block_size
];
Run
(
p_a_global
,
p_b_global
,
p_c_global
,
p_shared_block
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
{});
}
#if 0
// pass tensor descriptors by their pointers
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const AGlobalDesc* p_a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const BGlobalDesc* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const CGlobalDesc* p_c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_e_k_global_desc = *p_a_e_k_global_desc;
const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc;
const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc;
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
// pass tensor descriptors by void*
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__device__ void Run(const void* p_a_e_k_global_desc,
const FloatAB* __restrict__ p_a_global,
const void* p_b_e_n_ho_wo_global_desc,
const FloatAB* __restrict__ p_b_global,
const void* p_c_k_n_ho_wo_global_desc,
FloatC* __restrict__ p_c_global,
integral_constant<bool, HasMainKBlockLoop>,
integral_constant<bool, HasDoubleTailKBlockLoop>) const
{
const auto a_e_k_global_desc = *reinterpret_cast<const AGlobalDesc*>(p_a_e_k_global_desc);
const auto b_e_n_ho_wo_global_desc =
*reinterpret_cast<const BGlobalDesc*>(p_b_e_n_ho_wo_global_desc);
const auto c_k_n_ho_wo_global_desc =
*reinterpret_cast<const CGlobalDesc*>(p_c_k_n_ho_wo_global_desc);
Run(a_e_k_global_desc,
p_a_global,
b_e_n_ho_wo_global_desc,
p_b_global,
c_k_n_ho_wo_global_desc,
p_c_global,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
}
#endif
};
}
// namespace ck
#endif
composable_kernel/include/utility/config.hpp
View file @
78740e43
...
@@ -16,9 +16,9 @@
...
@@ -16,9 +16,9 @@
// GPU ID
// GPU ID
#if 0
#if 0
#define CK_AMD_GPU_GFX906 1
#define CK_AMD_GPU_GFX906 1
#elif
1
#define CK_AMD_GPU_GFX908 1
#elif
0
#elif
0
#define CK_AMD_GPU_GFX908 1
#elif 1
#define CK_AMD_GPU_GFX1030 1
#define CK_AMD_GPU_GFX1030 1
#endif
#endif
...
@@ -93,7 +93,7 @@
...
@@ -93,7 +93,7 @@
// experimental implementation
// experimental implementation
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
0
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
1
#endif
#endif
#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
...
...
host/driver_offline/conv_fwd_driver_offline.cpp
View file @
78740e43
...
@@ -15,15 +15,15 @@
...
@@ -15,15 +15,15 @@
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "device_
dynam
ic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_
stat
ic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE
1
#define USE_DYNAMIC_MODE
0
#define USE_CONV_FWD_V4R4_NCHW
1
#define USE_CONV_FWD_V4R4_NCHW
0
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V4R4R2_NHWC 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW
0
#define USE_CONV_FWD_V5R1_NCHW
1
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
...
@@ -102,16 +102,16 @@ int main(int argc, char* argv[])
...
@@ -102,16 +102,16 @@ int main(int argc, char* argv[])
const
bool
do_log
=
atoi
(
argv
[
5
]);
const
bool
do_log
=
atoi
(
argv
[
5
]);
const
int
nrepeat
=
atoi
(
argv
[
6
]);
const
int
nrepeat
=
atoi
(
argv
[
6
]);
constexpr
index_t
N
=
1
28
;
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
1
92
;
constexpr
index_t
C
=
1
6
;
constexpr
index_t
Hi
=
71
;
constexpr
index_t
Hi
=
540
;
constexpr
index_t
Wi
=
71
;
constexpr
index_t
Wi
=
960
;
constexpr
index_t
K
=
25
6
;
constexpr
index_t
K
=
1
6
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
X
=
3
;
const
index_t
conv_stride_h
=
2
;
const
index_t
conv_stride_h
=
1
;
const
index_t
conv_stride_w
=
2
;
const
index_t
conv_stride_w
=
1
;
const
index_t
conv_dilation_h
=
1
;
const
index_t
conv_dilation_h
=
1
;
const
index_t
conv_dilation_w
=
1
;
const
index_t
conv_dilation_w
=
1
;
const
index_t
in_left_pad_h
=
1
;
const
index_t
in_left_pad_h
=
1
;
...
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
...
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
const
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
XEff
)
/
conv_stride_w
+
1
;
#endif
#endif
#if
1
#if
0
using in_data_t = float;
using in_data_t = float;
using acc_data_t = float;
using acc_data_t = float;
using out_data_t = float;
using out_data_t = float;
...
@@ -380,20 +380,20 @@ int main(int argc, char* argv[])
...
@@ -380,20 +380,20 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nchw
();
const
auto
tmp
=
f_make_for_device_nchw
();
device_
dynam
ic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
<
in_data_t
,
device_
stat
ic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
<
in_data_t
,
16
,
8
,
acc_data_t
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I2
],
tmp
[
I3
],
tmp
[
I3
],
tmp
[
I4
],
tmp
[
I4
],
tmp
[
I5
],
tmp
[
I5
],
tmp
[
I6
],
tmp
[
I6
],
in
,
in
,
wei
,
wei
,
out_device
,
out_device
,
nrepeat
);
nrepeat
);
}
}
#endif
#endif
...
...
host/driver_offline/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
View file @
78740e43
...
@@ -96,25 +96,25 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
...
@@ -96,25 +96,25 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
// cdata = 64, BlockSize = 64, 16x8x32x4
// cdata = 64, BlockSize = 64, 16x8x32x4
constexpr
index_t
BlockSize
=
64
;
constexpr
index_t
BlockSize
=
64
;
constexpr
index_t
KPerBlock
=
16
;
constexpr
index_t
KPerBlock
=
K
;
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
EPerBlock
=
1
;
constexpr
index_t
EPerBlock
=
C0
;
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
EPerThread
=
EPerBlock
;
constexpr
index_t
EPerThread
=
EPerBlock
;
using
ABlockTransferThreadSliceLengths_E_K
=
Sequence
<
3
,
1
>
;
using
ABlockTransferThreadSliceLengths_E_K
=
Sequence
<
9
,
1
>
;
using
ABlockTransferThreadClusterLengths_E_K
=
Sequence
<
3
*
EPerBlock
,
KPerBlock
>
;
using
ABlockTransferThreadClusterLengths_E_K
=
Sequence
<
EPerBlock
,
KPerBlock
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_E
=
1
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_E
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K
=
1
;
constexpr
index_t
BThreadTransferSrcScalarPerVector_W
=
1
;
constexpr
index_t
BThreadTransferSrcScalarPerVector_W
=
1
;
constexpr
index_t
CThreadTransferDstScalarPerVector_W
=
16
;
constexpr
index_t
CThreadTransferDstScalarPerVector_W
=
8
;
static_assert
(
KPerThread
%
CThreadTransferDstScalarPerVector_W
==
0
,
""
);
static_assert
(
KPerThread
%
CThreadTransferDstScalarPerVector_W
==
0
,
""
);
#else
#else
...
...
host/driver_offline/include/device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
0 → 100644
View file @
78740e43
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
//#include "driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp"
#include "driver_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp"
template
<
typename
TInWei
,
ck
::
index_t
InWeiVectorSize
,
typename
TAcc
,
typename
TOut
,
typename
InLengths
,
typename
WeiLengths
,
typename
OutLengths
,
typename
ConvStrides
,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
device_static_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw
(
const
InLengths
&
in_n_c_hi_wi_lengths
,
const
WeiLengths
&
wei_k_c_y_x_lengths
,
const
OutLengths
&
out_n_k_ho_wo_lengths
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
const
InRightPads
&
in_right_pads
,
const
Tensor
<
TInWei
>&
in_n_c_hi_wi
,
const
Tensor
<
TInWei
>&
wei_k_c_y_x
,
Tensor
<
TOut
>&
out_n_k_ho_wo
,
ck
::
index_t
nrepeat
)
{
using
namespace
ck
;
std
::
cout
<<
"device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw"
<<
std
::
endl
;
DeviceMem
in_n_c_hi_wi_device_buf
(
sizeof
(
TInWei
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c_y_x_device_buf
(
sizeof
(
TInWei
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_k_ho_wo_device_buf
(
sizeof
(
TOut
)
*
out_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
N
=
OutLengths
{}[
I0
];
constexpr
auto
K
=
OutLengths
{}[
I1
];
constexpr
auto
C
=
WeiLengths
{}[
I1
];
constexpr
auto
Hi
=
InLengths
{}[
I2
];
constexpr
auto
Wi
=
InLengths
{}[
I3
];
constexpr
auto
Ho
=
OutLengths
{}[
I2
];
constexpr
auto
Wo
=
OutLengths
{}[
I3
];
constexpr
auto
Y
=
WeiLengths
{}[
I2
];
constexpr
auto
X
=
WeiLengths
{}[
I3
];
constexpr
auto
C0
=
C
/
Number
<
InWeiVectorSize
>
{};
constexpr
auto
C1
=
Number
<
InWeiVectorSize
>
{};
constexpr
auto
K0
=
K
/
Number
<
InWeiVectorSize
>
{};
constexpr
auto
K1
=
Number
<
InWeiVectorSize
>
{};
#if 0
// run-time variables
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(InDesc::GetLengths()));
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(WeiDesc::GetLengths()));
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(to_multi_index(OutDesc::GetLengths()));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
#else
// compile-time variables
constexpr
auto
in_n_c0_hi_wi_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
N
>
{},
Number
<
C0
>
{},
Number
<
Hi
>
{},
Number
<
Wi
>
{}));
constexpr
auto
wei_k_c0_y_x_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
K
>
{},
Number
<
C0
>
{},
Number
<
Y
>
{},
Number
<
X
>
{}));
constexpr
auto
out_n_k0_ho_wo_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
N
>
{},
Number
<
K0
>
{},
Number
<
Ho
>
{},
Number
<
Wo
>
{}));
// constexpr auto conv_strides = sequence_to_tuple_of_number(ConvStrides{});
// constexpr auto conv_dilations = sequence_to_tuple_of_number(ConvDilations{});
// constexpr auto in_left_pads = sequence_to_tuple_of_number(InLeftPads{});
// constexpr auto in_right_pads = sequence_to_tuple_of_number(InRightPads{});
#endif
Tensor
<
TInWei
>
in_n_c0_hi_wi_c1
(
HostTensorDescriptor
(
std
::
initializer_list
<
index_t
>
{
N
,
C0
,
Hi
,
Wi
,
C1
}));
Tensor
<
TInWei
>
wei_k_c0_y_x_c1
(
HostTensorDescriptor
(
std
::
initializer_list
<
index_t
>
{
K
,
C0
,
Y
,
X
,
C1
}));
Tensor
<
TOut
>
out_n_k0_ho_wo_k1
(
HostTensorDescriptor
(
std
::
initializer_list
<
index_t
>
{
N
,
K0
,
Ho
,
Wo
,
K1
}));
auto
f_nchw2nc0hwc1
=
[
&
](
auto
n
,
auto
hi
,
auto
wi
,
auto
c
)
{
in_n_c0_hi_wi_c1
(
n
,
c
/
InWeiVectorSize
,
hi
,
wi
,
c
%
InWeiVectorSize
)
=
in_n_c_hi_wi
(
n
,
c
,
hi
,
wi
);
};
auto
f_kcyx2kc0yxc1
=
[
&
](
auto
k
,
auto
y
,
auto
x
,
auto
c
)
{
wei_k_c0_y_x_c1
(
k
,
c
/
InWeiVectorSize
,
y
,
x
,
c
%
InWeiVectorSize
)
=
wei_k_c_y_x
(
k
,
c
,
y
,
x
);
};
make_ParallelTensorFunctor
(
f_nchw2nc0hwc1
,
N
,
Hi
,
Wi
,
C
)();
make_ParallelTensorFunctor
(
f_kcyx2kc0yxc1
,
K
,
Y
,
X
,
C
)();
in_n_c_hi_wi_device_buf
.
ToDevice
(
in_n_c0_hi_wi_c1
.
mData
.
data
());
wei_k_c_y_x_device_buf
.
ToDevice
(
wei_k_c0_y_x_c1
.
mData
.
data
());
// cdata = 64, BlockSize = 64, 16x8x32x4
constexpr
index_t
BlockSize
=
64
;
constexpr
index_t
KPerBlock
=
K
;
constexpr
index_t
HoPerBlock
=
8
;
constexpr
index_t
WoPerBlock
=
32
;
constexpr
index_t
EPerBlock
=
C0
;
constexpr
index_t
KPerThread
=
KPerBlock
;
constexpr
index_t
HoPerThread
=
2
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
EPerThread
=
EPerBlock
;
using
ABlockTransferThreadSliceLengths_E_K
=
Sequence
<
9
,
1
>
;
using
ABlockTransferThreadClusterLengths_E_K
=
Sequence
<
EPerBlock
,
KPerBlock
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_E
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K
=
1
;
constexpr
index_t
BThreadTransferSrcScalarPerVector_W
=
1
;
constexpr
index_t
CThreadTransferDstScalarPerVector_W
=
1
;
std
::
cerr
<<
"conv_fp16_nchwc"
<<
C1
<<
"_n"
<<
N
<<
"c"
<<
C
<<
"h"
<<
Hi
<<
"w"
<<
Wi
<<
"-k"
<<
K
<<
"c"
<<
C
<<
"y"
<<
Y
<<
"x"
<<
X
<<
"-u"
<<
conv_strides
[
I0
]
<<
"v"
<<
conv_strides
[
I1
]
<<
"l"
<<
conv_dilations
[
I0
]
<<
"j"
<<
conv_dilations
[
I1
]
<<
"q"
<<
in_left_pads
[
I0
]
<<
"p"
<<
in_right_pads
[
I0
]
<<
std
::
endl
;
constexpr
auto
conv_driver
=
#if 0
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
#else
DriverStaticConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
#endif
<
BlockSize
,
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
,
TAcc
,
typename
vector_type
<
TOut
,
InWeiVectorSize
>::
type
,
KPerBlock
,
HoPerBlock
,
WoPerBlock
,
EPerBlock
,
KPerThread
,
HoPerThread
,
WoPerThread
,
EPerThread
,
ABlockTransferThreadSliceLengths_E_K
,
ABlockTransferThreadClusterLengths_E_K
,
ABlockTransferSrcScalarPerVector_E
,
ABlockTransferDstScalarPerVector_K
,
BThreadTransferSrcScalarPerVector_W
,
CThreadTransferDstScalarPerVector_W
>
{};
conv_driver
.
Run
(
wei_k_c0_y_x_desc
,
in_n_c0_hi_wi_desc
,
out_n_k0_ho_wo_desc
,
conv_strides
,
conv_dilations
,
in_left_pads
,
in_right_pads
,
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
wei_k_c_y_x_device_buf
.
GetDeviceBuffer
()),
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
in_n_c_hi_wi_device_buf
.
GetDeviceBuffer
()),
static_cast
<
typename
vector_type
<
TOut
,
InWeiVectorSize
>::
type
*>
(
out_n_k_ho_wo_device_buf
.
GetDeviceBuffer
()));
out_n_k_ho_wo_device_buf
.
FromDevice
(
out_n_k0_ho_wo_k1
.
mData
.
data
());
#if 1
auto
f_nk0hwk1_to_nkhw
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
out_n_k_ho_wo
(
n
,
k
,
ho
,
wo
)
=
out_n_k0_ho_wo_k1
(
n
,
k
/
InWeiVectorSize
,
ho
,
wo
,
k
%
InWeiVectorSize
);
};
make_ParallelTensorFunctor
(
f_nk0hwk1_to_nkhw
,
N
,
K
,
Ho
,
Wo
)();
#endif
}
script/cmake-rocm.sh
View file @
78740e43
...
@@ -3,13 +3,13 @@ rm -f CMakeCache.txt
...
@@ -3,13 +3,13 @@ rm -f CMakeCache.txt
rm
-f
*
.cmake
rm
-f
*
.cmake
rm
-rf
CMakeFiles
rm
-rf
CMakeFiles
MY_PROJECT_SOURCE
=
../
../../
MY_PROJECT_SOURCE
=
../
MY_PROJECT_INSTALL
=
../install.dir
MY_PROJECT_INSTALL
=
../install.dir
cmake
\
cmake
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx
908
-mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=
$PWD
"
\
-D
CMAKE_CXX_FLAGS
=
"-O3 --amdgpu-target=gfx
1030
-mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=
$PWD
"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_PREFIX_PATH
=
/opt/rocm
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
...
...
script/run.sh
View file @
78740e43
...
@@ -13,8 +13,8 @@
...
@@ -13,8 +13,8 @@
#export OLC_DEBUG_SAVE_TEMP_DIR=1
#export OLC_DEBUG_SAVE_TEMP_DIR=1
make
-j
conv_fwd_driver_offline
make
-j
conv_fwd_driver_offline
make
-j
conv_bwd_driver_offline
#
make -j conv_bwd_driver_offline
make
-j
conv_fwd_driver_online
#
make -j conv_fwd_driver_online
#rm -rf /root/_hip_binary_kernels_/
#rm -rf /root/_hip_binary_kernels_/
#rm -rf /tmp/olCompile*
#rm -rf /tmp/olCompile*
...
@@ -26,11 +26,13 @@ INIT=$4
...
@@ -26,11 +26,13 @@ INIT=$4
LOG
=
$5
LOG
=
$5
REPEAT
=
$6
REPEAT
=
$6
./host/driver_offline/conv_fwd_driver_offline
$LAYOUT
$ALGO
$VERIFY
$INIT
$LOG
$REPEAT
################################################ layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
################################################ layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 1024 1 7 17 17 1 1 1 1 0 3 0 3
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 256 1024 1 7 17 17 1 1 1 1 0 3 0 3
./host/driver_offline/conv_fwd_driver_offline
$LAYOUT
$ALGO
$VERIFY
$INIT
$LOG
$REPEAT
256 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#
./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 256 256 3 3 14 14 1 1 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 128 3 3 14 14 1 1 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 128 3 3 14 14 1 1 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 512 512 3 3 7 7 1 1 1 1 1 1 1 1
#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 256 512 512 3 3 7 7 1 1 1 1 1 1 1 1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment