Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
4e57b30a
Commit
4e57b30a
authored
Aug 11, 2021
by
Chao Liu
Browse files
rename
parent
c03045ce
Changes
30
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
140 additions
and
140 deletions
+140
-140
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp
...on_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp
+15
-15
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
...on_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
+15
-15
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
...on_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
+15
-15
host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
+15
-15
host/driver_offline/include/driver_contraction_dlops_v1r2.hpp
.../driver_offline/include/driver_contraction_dlops_v1r2.hpp
+15
-15
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+10
-10
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
...orward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
+10
-10
host/driver_offline/include/driver_gemm_dlops_v1r2.hpp
host/driver_offline/include/driver_gemm_dlops_v1r2.hpp
+15
-15
host/driver_offline/include/driver_gemm_dlops_v1r3.hpp
host/driver_offline/include/driver_gemm_dlops_v1r3.hpp
+15
-15
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp
+15
-15
No files found.
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp
View file @
4e57b30a
...
...
@@ -121,12 +121,12 @@ void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk(
const
auto
out_gemmm_gemmn_grid_desc
=
descs
[
I2
];
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
wei_gemmk0_gemmm_gemmk1_grid_
i
te
rator
_hacks
=
make_tuple
(
constexpr
auto
wei_gemmk0_gemmm_gemmk1_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
in_gemmk0_gemmn_gemmk1_grid_
i
te
rator
_hacks
=
constexpr
auto
in_gemmk0_gemmn_gemmk1_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{}),
...
...
@@ -134,7 +134,7 @@ void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
>
{}));
constexpr
auto
out_m0_m1_m2_n_grid_
i
te
rator
_hacks
=
constexpr
auto
out_m0_m1_m2_n_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
...
...
@@ -144,10 +144,10 @@ void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{}));
constexpr
auto
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
=
constexpr
auto
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
s
te
p
_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
>
{};
constexpr
auto
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
=
constexpr
auto
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
s
te
p
_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
2
,
0
,
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
...
...
@@ -187,22 +187,22 @@ void device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk(
Sequence
<
2
,
3
,
0
,
1
>
,
2
,
GemmCThreadTransferDstScalarPerVector
,
decltype
(
wei_gemmk0_gemmm_gemmk1_grid_
i
te
rator
_hacks
),
decltype
(
in_gemmk0_gemmn_gemmk1_grid_
i
te
rator
_hacks
),
decltype
(
out_m0_m1_m2_n_grid_
i
te
rator
_hacks
),
decltype
(
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
),
decltype
(
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
)
>
(
decltype
(
wei_gemmk0_gemmm_gemmk1_grid_
s
te
p
_hacks
),
decltype
(
in_gemmk0_gemmn_gemmk1_grid_
s
te
p
_hacks
),
decltype
(
out_m0_m1_m2_n_grid_
s
te
p
_hacks
),
decltype
(
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
s
te
p
_hacks
),
decltype
(
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
s
te
p
_hacks
)
>
(
static_cast
<
TInWei
*>
(
wei_k_y_x_c_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TInWei
*>
(
in_n_hi_wi_c_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
out_n_ho_wo_k_device_buf
.
GetDeviceBuffer
()),
wei_gemmk0_gemmm_gemmk1_grid_desc
,
in_gemmk0_gemmn_gemmk1_grid_desc
,
out_gemmm_gemmn_grid_desc
,
wei_gemmk0_gemmm_gemmk1_grid_
i
te
rator
_hacks
,
in_gemmk0_gemmn_gemmk1_grid_
i
te
rator
_hacks
,
out_m0_m1_m2_n_grid_
i
te
rator
_hacks
,
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
,
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
,
wei_gemmk0_gemmm_gemmk1_grid_
s
te
p
_hacks
,
in_gemmk0_gemmn_gemmk1_grid_
s
te
p
_hacks
,
out_m0_m1_m2_n_grid_
s
te
p
_hacks
,
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
s
te
p
_hacks
,
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
s
te
p
_hacks
,
nrepeat
);
{
...
...
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
View file @
4e57b30a
...
...
@@ -182,12 +182,12 @@ void device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk(
const
auto
out_gemmm_gemmn_grid_desc
=
descs
[
I2
];
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
wei_gemmk0_gemmm_gemmk1_grid_
i
te
rator
_hacks
=
make_tuple
(
constexpr
auto
wei_gemmk0_gemmm_gemmk1_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
in_gemmk0_gemmn_gemmk1_grid_
i
te
rator
_hacks
=
constexpr
auto
in_gemmk0_gemmn_gemmk1_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{}),
...
...
@@ -195,7 +195,7 @@ void device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
>
{}));
constexpr
auto
out_m0_m1_m2_n_grid_
i
te
rator
_hacks
=
constexpr
auto
out_m0_m1_m2_n_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
...
...
@@ -213,10 +213,10 @@ void device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{}));
constexpr
auto
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
=
constexpr
auto
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
s
te
p
_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
>
{};
constexpr
auto
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
=
constexpr
auto
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
s
te
p
_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
2
,
0
,
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
...
...
@@ -256,11 +256,11 @@ void device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk(
Sequence
<
2
,
3
,
0
,
1
,
7
,
5
,
4
,
6
>
,
6
,
GemmCThreadTransferDstScalarPerVector
,
decltype
(
wei_gemmk0_gemmm_gemmk1_grid_
i
te
rator
_hacks
),
decltype
(
in_gemmk0_gemmn_gemmk1_grid_
i
te
rator
_hacks
),
decltype
(
out_m0_m1_m2_n_grid_
i
te
rator
_hacks
),
decltype
(
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
),
decltype
(
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
),
decltype
(
wei_gemmk0_gemmm_gemmk1_grid_
s
te
p
_hacks
),
decltype
(
in_gemmk0_gemmn_gemmk1_grid_
s
te
p
_hacks
),
decltype
(
out_m0_m1_m2_n_grid_
s
te
p
_hacks
),
decltype
(
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
s
te
p
_hacks
),
decltype
(
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
s
te
p
_hacks
),
false
// CAccessOrderMRepeatNRepeat
>
(
static_cast
<
TInWei
*>
(
wei_k_y_x_c_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TInWei
*>
(
in_n_hi_wi_c_device_buf
.
GetDeviceBuffer
()),
...
...
@@ -268,11 +268,11 @@ void device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk(
wei_gemmk0_gemmm_gemmk1_grid_desc
,
in_gemmk0_gemmn_gemmk1_grid_desc
,
out_gemmm_gemmn_grid_desc
,
wei_gemmk0_gemmm_gemmk1_grid_
i
te
rator
_hacks
,
in_gemmk0_gemmn_gemmk1_grid_
i
te
rator
_hacks
,
out_m0_m1_m2_n_grid_
i
te
rator
_hacks
,
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
,
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
,
wei_gemmk0_gemmm_gemmk1_grid_
s
te
p
_hacks
,
in_gemmk0_gemmn_gemmk1_grid_
s
te
p
_hacks
,
out_m0_m1_m2_n_grid_
s
te
p
_hacks
,
wei_gemmk0_gemmm_gemmk1_grid_move_slice_window_
s
te
p
_hacks
,
in_gemmk0_gemmn_gemmk1_grid_move_slice_window_
s
te
p
_hacks
,
nrepeat
);
{
...
...
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
View file @
4e57b30a
...
...
@@ -233,7 +233,7 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
const
auto
out_gemmm_gemmn_grid_desc
=
descs
[
I2
];
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
in_gemmk0_gemmm_gemmk1_grid_
i
te
rator
_hacks
=
constexpr
auto
in_gemmk0_gemmm_gemmk1_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
// 0+: GemmK0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
>
{},
// 1+: GemmM
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{}),
// 2+: GemmK1
...
...
@@ -241,7 +241,7 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
>
{},
// 1-: GemmM
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
>
{}));
// 2-: GemmK1
constexpr
auto
wei_gemmk0_gemmn_gemmk1_grid_
i
te
rator
_hacks
=
constexpr
auto
wei_gemmk0_gemmn_gemmk1_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
// 0+: GemmK0
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
// 1+: GemmN
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
// 2+: GemmK1
...
...
@@ -249,7 +249,7 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
// 1-: GemmN
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
// 2-: GemmK1
constexpr
auto
out_m0_m1_m2_n_grid_
i
te
rator
_hacks
=
constexpr
auto
out_m0_m1_m2_n_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
// 0+: MRepeat
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
// 1+: NRepeat
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
// 2+: MWaves
...
...
@@ -267,10 +267,10 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
// 6-: M2
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
// 7-: N1
constexpr
auto
in_gemmk0_gemmm_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
=
constexpr
auto
in_gemmk0_gemmm_gemmk1_grid_move_slice_window_
s
te
p
_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
2
,
0
,
0
>
{};
constexpr
auto
wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
=
constexpr
auto
wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_
s
te
p
_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
...
...
@@ -311,11 +311,11 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
Sequence
<
2
,
3
,
0
,
1
,
7
,
5
,
4
,
6
>
,
7
,
GemmCThreadTransferDstScalarPerVector
,
decltype
(
in_gemmk0_gemmm_gemmk1_grid_
i
te
rator
_hacks
),
decltype
(
wei_gemmk0_gemmn_gemmk1_grid_
i
te
rator
_hacks
),
decltype
(
out_m0_m1_m2_n_grid_
i
te
rator
_hacks
),
decltype
(
in_gemmk0_gemmm_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
),
decltype
(
wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
),
decltype
(
in_gemmk0_gemmm_gemmk1_grid_
s
te
p
_hacks
),
decltype
(
wei_gemmk0_gemmn_gemmk1_grid_
s
te
p
_hacks
),
decltype
(
out_m0_m1_m2_n_grid_
s
te
p
_hacks
),
decltype
(
in_gemmk0_gemmm_gemmk1_grid_move_slice_window_
s
te
p
_hacks
),
decltype
(
wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_
s
te
p
_hacks
),
false
// CAccessOrderMRepeatNRepeat
>
(
static_cast
<
TInWei
*>
(
in_n_hi_wi_c_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TInWei
*>
(
wei_k_y_x_c_device_buf
.
GetDeviceBuffer
()),
...
...
@@ -323,11 +323,11 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
in_gemmk0_gemmm_gemmk1_grid_desc
,
wei_gemmk0_gemmn_gemmk1_grid_desc
,
out_gemmm_gemmn_grid_desc
,
in_gemmk0_gemmm_gemmk1_grid_
i
te
rator
_hacks
,
wei_gemmk0_gemmn_gemmk1_grid_
i
te
rator
_hacks
,
out_m0_m1_m2_n_grid_
i
te
rator
_hacks
,
in_gemmk0_gemmm_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
,
wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_
i
te
rator
_hacks
,
in_gemmk0_gemmm_gemmk1_grid_
s
te
p
_hacks
,
wei_gemmk0_gemmn_gemmk1_grid_
s
te
p
_hacks
,
out_m0_m1_m2_n_grid_
s
te
p
_hacks
,
in_gemmk0_gemmm_gemmk1_grid_move_slice_window_
s
te
p
_hacks
,
wei_gemmk0_gemmn_gemmk1_grid_move_slice_window_
s
te
p
_hacks
,
nrepeat
);
{
...
...
host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
View file @
4e57b30a
...
...
@@ -130,7 +130,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
const
auto
out_grid_desc_gm0_gm1_gn0_gn1
=
descs
[
I2
];
// HACK: hacks that control index calculation when iterating over A, B, C matrix
constexpr
auto
wei_grid_
i
te
rator
_hacks
=
constexpr
auto
wei_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: GK0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1+: GM0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 2+: GM10
...
...
@@ -142,7 +142,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 3-: GM11
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 4-: GK1
constexpr
auto
in_grid_
i
te
rator
_hacks
=
make_tuple
(
constexpr
auto
in_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: GK0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
>
{},
// 1+: GN0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
>
{},
// 2+: GN10
...
...
@@ -154,7 +154,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
,
0
>
{},
// 3-: GN11
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// 4-: GK1
constexpr
auto
out_grid_
i
te
rator
_hacks
=
make_tuple
(
constexpr
auto
out_grid_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 0+: GM10
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
// 1+: BM0
...
...
@@ -170,9 +170,9 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
>
{},
// 4-: BN0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
,
0
>
{}));
// 5-: GN1
constexpr
auto
wei_grid_move_slice_window_
i
te
rator
_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{};
constexpr
auto
wei_grid_move_slice_window_
s
te
p
_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{};
constexpr
auto
in_grid_move_slice_window_
i
te
rator
_hacks
=
constexpr
auto
in_grid_move_slice_window_
s
te
p
_hacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
2
,
0
,
0
,
0
,
0
,
0
>
{};
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
...
...
@@ -211,22 +211,22 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
Sequence
<
3
,
4
,
5
,
0
,
1
,
2
>
,
// CThreadTransferSrcDstAccessOrder
5
,
// CThreadTransferSrcDstVectorDim
CThreadTransferDstScalarPerVector_BN1
,
decltype
(
wei_grid_
i
te
rator
_hacks
),
decltype
(
in_grid_
i
te
rator
_hacks
),
decltype
(
out_grid_
i
te
rator
_hacks
),
decltype
(
wei_grid_move_slice_window_
i
te
rator
_hacks
),
decltype
(
in_grid_move_slice_window_
i
te
rator
_hacks
)
>
(
decltype
(
wei_grid_
s
te
p
_hacks
),
decltype
(
in_grid_
s
te
p
_hacks
),
decltype
(
out_grid_
s
te
p
_hacks
),
decltype
(
wei_grid_move_slice_window_
s
te
p
_hacks
),
decltype
(
in_grid_move_slice_window_
s
te
p
_hacks
)
>
(
static_cast
<
TInWei
*>
(
wei_k_c_y_x_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TInWei
*>
(
in_n_c_hi_wi_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
out_n_k_ho_wo_device_buf
.
GetDeviceBuffer
()),
wei_grid_desc_gk0_gm0_gm1_gk1
,
in_grid_desc_gk0_gn0_gn1_gk1
,
out_grid_desc_gm0_gm1_gn0_gn1
,
wei_grid_
i
te
rator
_hacks
,
in_grid_
i
te
rator
_hacks
,
out_grid_
i
te
rator
_hacks
,
wei_grid_move_slice_window_
i
te
rator
_hacks
,
in_grid_move_slice_window_
i
te
rator
_hacks
,
wei_grid_
s
te
p
_hacks
,
in_grid_
s
te
p
_hacks
,
out_grid_
s
te
p
_hacks
,
wei_grid_move_slice_window_
s
te
p
_hacks
,
in_grid_move_slice_window_
s
te
p
_hacks
,
nrepeat
);
float
perf
=
static_cast
<
float
>
(
calculate_convolution_flops
(
...
...
host/driver_offline/include/driver_contraction_dlops_v1r2.hpp
View file @
4e57b30a
...
...
@@ -39,11 +39,11 @@ template <ck::index_t BlockSize,
typename
CThreadTransferSrcDstAccessOrder
,
ck
::
index_t
CThreadTransferSrcDstVectorDim
,
ck
::
index_t
CThreadTransferDstScalarPerVector
,
typename
AGrid
I
te
rator
Hacks
,
typename
BGrid
I
te
rator
Hacks
,
typename
CGrid
I
te
rator
Hacks
,
typename
AGridMoveSliceWindow
I
te
rator
Hacks
,
typename
BGridMoveSliceWindow
I
te
rator
Hacks
>
typename
AGrid
S
te
p
Hacks
,
typename
BGrid
S
te
p
Hacks
,
typename
CGrid
S
te
p
Hacks
,
typename
AGridMoveSliceWindow
S
te
p
Hacks
,
typename
BGridMoveSliceWindow
S
te
p
Hacks
>
__host__
float
driver_contraction_dlops_v1r2
(
const
FloatAB
*
p_a_grid
,
const
FloatAB
*
p_b_grid
,
...
...
@@ -51,11 +51,11 @@ driver_contraction_dlops_v1r2(const FloatAB* p_a_grid,
const
AGridDesc_GK0_GM0_GM1_GK1
&
a_grid_desc_gk0_gm0_gm1_gk1
,
const
BGridDesc_GK0_GN0_GN1_GK1
&
b_grid_desc_gk0_gn0_gn1_gk1
,
const
CGridDesc_GM0_GM1_GN0_GN1
&
c_grid_desc_gm0_gm1_gn0_gn1
,
AGrid
I
te
rator
Hacks
,
BGrid
I
te
rator
Hacks
,
CGrid
I
te
rator
Hacks
,
AGridMoveSliceWindow
I
te
rator
Hacks
,
BGridMoveSliceWindow
I
te
rator
Hacks
,
AGrid
S
te
p
Hacks
,
BGrid
S
te
p
Hacks
,
CGrid
S
te
p
Hacks
,
AGridMoveSliceWindow
S
te
p
Hacks
,
BGridMoveSliceWindow
S
te
p
Hacks
,
ck
::
index_t
nrepeat
)
{
...
...
@@ -104,11 +104,11 @@ driver_contraction_dlops_v1r2(const FloatAB* p_a_grid,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
AGrid
I
te
rator
Hacks
,
BGrid
I
te
rator
Hacks
,
CGrid
I
te
rator
Hacks
,
AGridMoveSliceWindow
I
te
rator
Hacks
,
BGridMoveSliceWindow
I
te
rator
Hacks
>
;
AGrid
S
te
p
Hacks
,
BGrid
S
te
p
Hacks
,
CGrid
S
te
p
Hacks
,
AGridMoveSliceWindow
S
te
p
Hacks
,
BGridMoveSliceWindow
S
te
p
Hacks
>
;
const
auto
GK0
=
a_grid_desc_gk0_gm0_gm1_gk1
.
GetLength
(
I0
);
...
...
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
View file @
4e57b30a
...
...
@@ -136,13 +136,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr
auto
a_e_k_global_
i
te
rator
_hacks
=
constexpr
auto
a_e_k_global_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
>
{}));
constexpr
auto
a_e_k_global_move_slice_window_
i
te
rator
_hack
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
a_e_k_global_move_slice_window_
s
te
p
_hack
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
b_e_n_ho_wo_global_
i
te
rator
_hacks
=
constexpr
auto
b_e_n_ho_wo_global_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
...
...
@@ -152,12 +152,12 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
b_e_n_ho_wo_global_move_slice_window_
i
te
rator
_hack
=
constexpr
auto
b_e_n_ho_wo_global_move_slice_window_
s
te
p
_hack
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr
auto
c_k_n_ho_wo_global_tensor_
i
te
rator
_hacks
=
constexpr
auto
c_k_n_ho_wo_global_tensor_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
...
...
@@ -202,11 +202,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
Sequence
<
0
,
2
,
3
,
1
>
,
0
,
CThreadTransferDstScalarPerVector_W
,
decltype
(
a_e_k_global_
i
te
rator
_hacks
),
decltype
(
b_e_n_ho_wo_global_
i
te
rator
_hacks
),
decltype
(
c_k_n_ho_wo_global_tensor_
i
te
rator
_hacks
),
decltype
(
a_e_k_global_move_slice_window_
i
te
rator
_hack
),
decltype
(
b_e_n_ho_wo_global_move_slice_window_
i
te
rator
_hack
)
>
;
decltype
(
a_e_k_global_
s
te
p
_hacks
),
decltype
(
b_e_n_ho_wo_global_
s
te
p
_hacks
),
decltype
(
c_k_n_ho_wo_global_tensor_
s
te
p
_hacks
),
decltype
(
a_e_k_global_move_slice_window_
s
te
p
_hack
),
decltype
(
b_e_n_ho_wo_global_move_slice_window_
s
te
p
_hack
)
>
;
const
auto
GridSize
=
(
K
/
KPerBlock
)
*
(
Ho
/
HoPerBlock
)
*
(
Wo
/
WoPerBlock
)
*
N
;
...
...
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
View file @
4e57b30a
...
...
@@ -149,13 +149,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
}
// hack to control index calculation when iterating over a_k_m_global tensor
constexpr
auto
a_e_k_global_
i
te
rator
_hacks
=
constexpr
auto
a_e_k_global_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
>
{}));
constexpr
auto
a_e_k_global_move_slice_window_
i
te
rator
_hack
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
a_e_k_global_move_slice_window_
s
te
p
_hack
=
Sequence
<
0
,
0
,
0
>
{};
constexpr
auto
b_e_n_ho_wo_global_
i
te
rator
_hacks
=
constexpr
auto
b_e_n_ho_wo_global_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
...
...
@@ -165,12 +165,12 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
constexpr
auto
b_e_n_ho_wo_global_move_slice_window_
i
te
rator
_hack
=
constexpr
auto
b_e_n_ho_wo_global_move_slice_window_
s
te
p
_hack
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{};
// hack to control index calculation when iterating over c_m0_m1_n0_n1_global tensor
// hack for NKHW format
constexpr
auto
c_k_n_ho_wo_global_tensor_
i
te
rator
_hacks
=
constexpr
auto
c_k_n_ho_wo_global_tensor_
s
te
p
_hacks
=
make_tuple
(
make_tuple
(
Sequence
<
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
...
...
@@ -214,11 +214,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
Sequence
<
0
,
2
,
3
,
1
>
,
0
,
CThreadTransferDstScalarPerVector_W
,
decltype
(
a_e_k_global_
i
te
rator
_hacks
),
decltype
(
b_e_n_ho_wo_global_
i
te
rator
_hacks
),
decltype
(
c_k_n_ho_wo_global_tensor_
i
te
rator
_hacks
),
decltype
(
a_e_k_global_move_slice_window_
i
te
rator
_hack
),
decltype
(
b_e_n_ho_wo_global_move_slice_window_
i
te
rator
_hack
)
>
;
decltype
(
a_e_k_global_
s
te
p
_hacks
),
decltype
(
b_e_n_ho_wo_global_
s
te
p
_hacks
),
decltype
(
c_k_n_ho_wo_global_tensor_
s
te
p
_hacks
),
decltype
(
a_e_k_global_move_slice_window_
s
te
p
_hack
),
decltype
(
b_e_n_ho_wo_global_move_slice_window_
s
te
p
_hack
)
>
;
const
auto
GridSize
=
(
K
/
KPerBlock
)
*
(
Hop
/
HoPerBlock
)
*
(
Wop
/
WoPerBlock
)
*
N
;
...
...
host/driver_offline/include/driver_gemm_dlops_v1r2.hpp
View file @
4e57b30a
...
...
@@ -43,22 +43,22 @@ template <ck::index_t BlockSize,
typename
CThreadTransferSrcDstAccessOrder
,
ck
::
index_t
CThreadTransferSrcDstVectorDim
,
ck
::
index_t
CThreadTransferDstScalarPerVector
,
typename
AGrid
I
te
rator
Hacks
,
typename
BGrid
I
te
rator
Hacks
,
typename
CGrid
I
te
rator
Hacks
,
typename
AGridMoveSliceWindow
I
te
rator
Hacks
,
typename
BGridMoveSliceWindow
I
te
rator
Hacks
>
typename
AGrid
S
te
p
Hacks
,
typename
BGrid
S
te
p
Hacks
,
typename
CGrid
S
te
p
Hacks
,
typename
AGridMoveSliceWindow
S
te
p
Hacks
,
typename
BGridMoveSliceWindow
S
te
p
Hacks
>
__host__
float
driver_gemm_dlops_v1r2
(
const
FloatAB
*
p_a_grid
,
const
FloatAB
*
p_b_grid
,
FloatC
*
p_c_grid
,
const
AKMGridDesc
&
a_k_m_grid_desc
,
const
BKNGridDesc
&
b_k_n_grid_desc
,
const
CMNGridDesc
&
c_m_n_grid_desc
,
AGrid
I
te
rator
Hacks
,
BGrid
I
te
rator
Hacks
,
CGrid
I
te
rator
Hacks
,
AGridMoveSliceWindow
I
te
rator
Hacks
,
BGridMoveSliceWindow
I
te
rator
Hacks
,
AGrid
S
te
p
Hacks
,
BGrid
S
te
p
Hacks
,
CGrid
S
te
p
Hacks
,
AGridMoveSliceWindow
S
te
p
Hacks
,
BGridMoveSliceWindow
S
te
p
Hacks
,
ck
::
index_t
nrepeat
)
{
...
...
@@ -109,11 +109,11 @@ __host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
AGrid
I
te
rator
Hacks
,
BGrid
I
te
rator
Hacks
,
CGrid
I
te
rator
Hacks
,
AGridMoveSliceWindow
I
te
rator
Hacks
,
BGridMoveSliceWindow
I
te
rator
Hacks
>
;
AGrid
S
te
p
Hacks
,
BGrid
S
te
p
Hacks
,
CGrid
S
te
p
Hacks
,
AGridMoveSliceWindow
S
te
p
Hacks
,
BGridMoveSliceWindow
S
te
p
Hacks
>
;
const
auto
M
=
a_k_m_grid_desc
.
GetLength
(
I1
);
const
auto
N
=
b_k_n_grid_desc
.
GetLength
(
I1
);
...
...
host/driver_offline/include/driver_gemm_dlops_v1r3.hpp
View file @
4e57b30a
...
...
@@ -39,22 +39,22 @@ template <ck::index_t BlockSize,
typename
CThreadTransferSrcDstAccessOrder
,
ck
::
index_t
CThreadTransferSrcDstVectorDim
,
ck
::
index_t
CThreadTransferDstScalarPerVector
,
typename
AGrid
I
te
rator
Hacks
,
typename
BGrid
I
te
rator
Hacks
,
typename
CGrid
I
te
rator
Hacks
,
typename
AGridMoveSliceWindow
I
te
rator
Hacks
,
typename
BGridMoveSliceWindow
I
te
rator
Hacks
>
typename
AGrid
S
te
p
Hacks
,
typename
BGrid
S
te
p
Hacks
,
typename
CGrid
S
te
p
Hacks
,
typename
AGridMoveSliceWindow
S
te
p
Hacks
,
typename
BGridMoveSliceWindow
S
te
p
Hacks
>
__host__
float
driver_gemm_dlops_v1r3
(
const
FloatAB
*
p_a_grid
,
const
FloatAB
*
p_b_grid
,
FloatC
*
p_c_grid
,
const
AK0MK1GridDesc
&
a_k0_m_k1_grid_desc
,
const
BK0NK1GridDesc
&
b_k0_n_k1_grid_desc
,
const
CMNGridDesc
&
c_m_n_grid_desc
,
AGrid
I
te
rator
Hacks
,
BGrid
I
te
rator
Hacks
,
CGrid
I
te
rator
Hacks
,
AGridMoveSliceWindow
I
te
rator
Hacks
,
BGridMoveSliceWindow
I
te
rator
Hacks
,
AGrid
S
te
p
Hacks
,
BGrid
S
te
p
Hacks
,
CGrid
S
te
p
Hacks
,
AGridMoveSliceWindow
S
te
p
Hacks
,
BGridMoveSliceWindow
S
te
p
Hacks
,
ck
::
index_t
nrepeat
)
{
...
...
@@ -102,11 +102,11 @@ __host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
AGrid
I
te
rator
Hacks
,
BGrid
I
te
rator
Hacks
,
CGrid
I
te
rator
Hacks
,
AGridMoveSliceWindow
I
te
rator
Hacks
,
BGridMoveSliceWindow
I
te
rator
Hacks
>
;
AGrid
S
te
p
Hacks
,
BGrid
S
te
p
Hacks
,
CGrid
S
te
p
Hacks
,
AGridMoveSliceWindow
S
te
p
Hacks
,
BGridMoveSliceWindow
S
te
p
Hacks
>
;
const
auto
M
=
a_k0_m_k1_grid_desc
.
GetLength
(
I1
);
const
auto
N
=
b_k0_n_k1_grid_desc
.
GetLength
(
I1
);
...
...
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp
View file @
4e57b30a
...
...
@@ -41,11 +41,11 @@ template <ck::index_t BlockSize,
typename
CThreadTransferSrcDstAccessOrder
,
ck
::
index_t
CThreadTransferSrcDstVectorDim
,
ck
::
index_t
CThreadTransferDstScalarPerVector
,
typename
AGrid
I
te
rator
Hacks
,
typename
BGrid
I
te
rator
Hacks
,
typename
CGrid
I
te
rator
Hacks
,
typename
AGridMoveSliceWindow
I
te
rator
Hacks
,
typename
BGridMoveSliceWindow
I
te
rator
Hacks
,
typename
AGrid
S
te
p
Hacks
,
typename
BGrid
S
te
p
Hacks
,
typename
CGrid
S
te
p
Hacks
,
typename
AGridMoveSliceWindow
S
te
p
Hacks
,
typename
BGridMoveSliceWindow
S
te
p
Hacks
,
bool
CAccessOrderMRepeatNRepeat
>
__host__
float
driver_gemm_xdlops_v2r3
(
const
FloatAB
*
p_a_grid
,
const
FloatAB
*
p_b_grid
,
...
...
@@ -53,11 +53,11 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
const
AK0MK1GridDesc
&
a_k0_m_k1_grid_desc
,
const
BK0NK1GridDesc
&
b_k0_n_k1_grid_desc
,
const
CMNGridDesc
&
c_m_n_grid_desc
,
AGrid
I
te
rator
Hacks
,
BGrid
I
te
rator
Hacks
,
CGrid
I
te
rator
Hacks
,
AGridMoveSliceWindow
I
te
rator
Hacks
,
BGridMoveSliceWindow
I
te
rator
Hacks
,
AGrid
S
te
p
Hacks
,
BGrid
S
te
p
Hacks
,
CGrid
S
te
p
Hacks
,
AGridMoveSliceWindow
S
te
p
Hacks
,
BGridMoveSliceWindow
S
te
p
Hacks
,
ck
::
index_t
nrepeat
)
{
...
...
@@ -103,11 +103,11 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
AGrid
I
te
rator
Hacks
,
BGrid
I
te
rator
Hacks
,
CGrid
I
te
rator
Hacks
,
AGridMoveSliceWindow
I
te
rator
Hacks
,
BGridMoveSliceWindow
I
te
rator
Hacks
,
AGrid
S
te
p
Hacks
,
BGrid
S
te
p
Hacks
,
CGrid
S
te
p
Hacks
,
AGridMoveSliceWindow
S
te
p
Hacks
,
BGridMoveSliceWindow
S
te
p
Hacks
,
CAccessOrderMRepeatNRepeat
>
;
{
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment