Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
c03045ce
Commit
c03045ce
authored
Aug 10, 2021
by
Chao Liu
Browse files
rename
parent
b2589957
Changes
54
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
518 additions
and
539 deletions
+518
-539
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp
...on_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp
+6
-9
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
...on_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
+6
-9
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
...on_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
+6
-9
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+6
-8
host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
+6
-9
host/driver_offline/include/driver_contraction_dlops_v1r2.hpp
.../driver_offline/include/driver_contraction_dlops_v1r2.hpp
+23
-23
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+16
-16
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
...orward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
+16
-16
host/driver_offline/include/driver_gemm_dlops_v1r2.hpp
host/driver_offline/include/driver_gemm_dlops_v1r2.hpp
+132
-134
host/driver_offline/include/driver_gemm_dlops_v1r3.hpp
host/driver_offline/include/driver_gemm_dlops_v1r3.hpp
+128
-129
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp
+66
-66
host/driver_offline/src/conv_bwd_driver_offline.cpp
host/driver_offline/src/conv_bwd_driver_offline.cpp
+33
-33
host/driver_offline/src/conv_fwd_driver_offline.cpp
host/driver_offline/src/conv_fwd_driver_offline.cpp
+70
-74
host/host_tensor/include/conv_common.hpp
host/host_tensor/include/conv_common.hpp
+4
-4
No files found.
host/driver_offline/include/device_
dynamic_
convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp
→
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk.hpp
View file @
c03045ce
...
...
@@ -2,7 +2,7 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "driver_
dynamic_
gemm_xdlops_v2r2.hpp"
#include "driver_gemm_xdlops_v2r2.hpp"
template
<
typename
TInWei
,
typename
TAcc
,
...
...
@@ -14,7 +14,7 @@ template <typename TInWei,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
device_
dynamic_
convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk
(
void
device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nhwk
(
const
InLengths
&
in_n_hi_wi_c_lengths
,
const
WeiLengths
&
wei_k_y_x_c_lengths
,
const
OutLengths
&
out_n_ho_wo_k_lengths
,
...
...
@@ -44,12 +44,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nh
wei_k_y_x_c_device_buf
.
ToDevice
(
wei_k_y_x_c
.
mData
.
data
());
out_n_ho_wo_k_device_buf
.
ToDevice
(
out_n_ho_wo_k
.
mData
.
data
());
const
auto
in_n_hi_wi_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
in_n_hi_wi_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
wei_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_k_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
out_n_ho_wo_k_lengths
);
const
auto
in_n_hi_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
in_n_hi_wi_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
make_naive_tensor_descriptor_packed
(
wei_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_k_desc
=
make_naive_tensor_descriptor_packed
(
out_n_ho_wo_k_lengths
);
#if 1
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
...
...
@@ -155,7 +152,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nhwc_kyxc_nh
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
float
ave_time
=
driver_
dynamic_
gemm_xdlops_v2r2
<
float
ave_time
=
driver_gemm_xdlops_v2r2
<
BlockSize
,
TInWei
,
TAcc
,
...
...
host/driver_offline/include/device_
dynamic_
convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
→
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk.hpp
View file @
c03045ce
...
...
@@ -2,7 +2,7 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp"
#include "driver_
dynamic_
gemm_xdlops_v2r3.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
TInWei
,
typename
TAcc
,
...
...
@@ -14,7 +14,7 @@ template <typename TInWei,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
device_
dynamic_
convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk
(
void
device_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nhwk
(
const
InLengths
&
in_n_hi_wi_c_lengths
,
const
WeiLengths
&
wei_k_y_x_c_lengths
,
const
OutLengths
&
out_n_ho_wo_k_lengths
,
...
...
@@ -49,12 +49,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nh
wei_k_y_x_c_device_buf
.
ToDevice
(
wei_k_y_x_c
.
mData
.
data
());
out_n_ho_wo_k_device_buf
.
ToDevice
(
out_n_ho_wo_k
.
mData
.
data
());
const
auto
in_n_hi_wi_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
in_n_hi_wi_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
wei_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_k_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
out_n_ho_wo_k_lengths
);
const
auto
in_n_hi_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
in_n_hi_wi_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
make_naive_tensor_descriptor_packed
(
wei_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_k_desc
=
make_naive_tensor_descriptor_packed
(
out_n_ho_wo_k_lengths
);
#if 1
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
...
...
@@ -224,7 +221,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r3_xdlops_nhwc_kyxc_nh
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
float
ave_time
=
driver_
dynamic_
gemm_xdlops_v2r3
<
float
ave_time
=
driver_gemm_xdlops_v2r3
<
BlockSize
,
TInWei
,
TAcc
,
...
...
host/driver_offline/include/device_
dynamic_
convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
→
host/driver_offline/include/device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp
View file @
c03045ce
...
...
@@ -2,7 +2,7 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "driver_
dynamic_
gemm_xdlops_v2r3.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"
template
<
typename
TInWei
,
typename
TAcc
,
...
...
@@ -14,7 +14,7 @@ template <typename TInWei,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
device_
dynamic_
convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk
(
void
device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk
(
const
InLengths
&
in_n_hi_wi_c_lengths
,
const
WeiLengths
&
wei_k_y_x_c_lengths
,
const
OutLengths
&
out_n_ho_wo_k_lengths
,
...
...
@@ -44,12 +44,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
wei_k_y_x_c_device_buf
.
ToDevice
(
wei_k_y_x_c
.
mData
.
data
());
out_n_ho_wo_k_device_buf
.
ToDevice
(
out_n_ho_wo_k
.
mData
.
data
());
const
auto
in_n_hi_wi_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
in_n_hi_wi_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
wei_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_k_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
out_n_ho_wo_k_lengths
);
const
auto
in_n_hi_wi_c_desc
=
make_naive_tensor_descriptor_packed
(
in_n_hi_wi_c_lengths
);
const
auto
wei_k_y_x_c_desc
=
make_naive_tensor_descriptor_packed
(
wei_k_y_x_c_lengths
);
const
auto
out_n_ho_wo_k_desc
=
make_naive_tensor_descriptor_packed
(
out_n_ho_wo_k_lengths
);
#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
...
...
@@ -278,7 +275,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nh
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
float
ave_time
=
driver_
dynamic_
gemm_xdlops_v2r3
<
float
ave_time
=
driver_gemm_xdlops_v2r3
<
BlockSize
,
TInWei
,
TAcc
,
...
...
host/driver_offline/include/device_
dynamic_
convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
→
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
View file @
c03045ce
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "driver_
dynamic_
convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "driver_
dynamic_
convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp"
template
<
typename
TInWei
,
ck
::
index_t
InWeiVectorSize
,
...
...
@@ -15,7 +15,7 @@ template <typename TInWei,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
device_
dynamic_
convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw
(
void
device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw
(
const
InLengths
&
in_n_c_hi_wi_lengths
,
const
WeiLengths
&
wei_k_c_y_x_lengths
,
const
OutLengths
&
out_n_k_ho_wo_lengths
,
...
...
@@ -85,12 +85,10 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
in_n_c0_hi_wi_c1_device_buf
.
ToDevice
(
in_n_c0_hi_wi_c1
.
mData
.
data
());
wei_k_c0_y_x_c1_device_buf
.
ToDevice
(
wei_k_c0_y_x_c1
.
mData
.
data
());
const
auto
in_n_c0_hi_wi_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
N
,
C0
,
Hi
,
Wi
));
const
auto
wei_k_c0_y_x_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
K
,
C0
,
Y
,
X
));
const
auto
in_n_c0_hi_wi_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
C0
,
Hi
,
Wi
));
const
auto
wei_k_c0_y_x_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C0
,
Y
,
X
));
const
auto
out_n_k0_ho_wo_k1_desc
=
make_
dynamic_
naive_tensor_descriptor_packed
_v2
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
));
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
));
#if 1
// cdata = 64, BlockSize = 64, 16x8x32x4
...
...
host/driver_offline/include/device_
dynamic_
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
→
host/driver_offline/include/device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp
View file @
c03045ce
...
...
@@ -3,7 +3,7 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "driver_
dynamic_
contraction_dlops_v1r2.hpp"
#include "driver_contraction_dlops_v1r2.hpp"
template
<
typename
TInWei
,
typename
TAcc
,
...
...
@@ -15,7 +15,7 @@ template <typename TInWei,
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
void
device_
dynamic_
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw
(
void
device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw
(
const
InLengths
&
in_n_c_hi_wi_lengths
,
const
WeiLengths
&
wei_k_c_y_x_lengths
,
const
OutLengths
&
out_n_k_ho_wo_lengths
,
...
...
@@ -44,12 +44,9 @@ void device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
wei_k_c_y_x_device_buf
.
ToDevice
(
wei_k_c_y_x
.
mData
.
data
());
out_n_k_ho_wo_device_buf
.
ToDevice
(
out_n_k_ho_wo
.
mData
.
data
());
const
auto
in_desc_n_c_hi_wi
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
in_n_c_hi_wi_lengths
);
const
auto
wei_desc_k_c_y_x
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
wei_k_c_y_x_lengths
);
const
auto
out_desc_n_k_ho_wo
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
out_n_k_ho_wo_lengths
);
const
auto
in_desc_n_c_hi_wi
=
make_naive_tensor_descriptor_packed
(
in_n_c_hi_wi_lengths
);
const
auto
wei_desc_k_c_y_x
=
make_naive_tensor_descriptor_packed
(
wei_k_c_y_x_lengths
);
const
auto
out_desc_n_k_ho_wo
=
make_naive_tensor_descriptor_packed
(
out_n_k_ho_wo_lengths
);
#if 0
// [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
...
...
@@ -180,7 +177,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
for
(
index_t
i
=
0
;
i
<
5
;
++
i
)
{
float
ave_time
=
driver_
dynamic_
contraction_dlops_v1r2
<
float
ave_time
=
driver_contraction_dlops_v1r2
<
BlockSize
,
TInWei
,
TAcc
,
...
...
host/driver_offline/include/driver_
dynamic_
contraction_dlops_v1r2.hpp
→
host/driver_offline/include/driver_contraction_dlops_v1r2.hpp
View file @
c03045ce
#ifndef DRIVER_
DYNAMIC_
CONTRACTION_DLOPS_V1R2_HPP
#define DRIVER_
DYNAMIC_
CONTRACTION_DLOPS_V1R2_HPP
#ifndef DRIVER_CONTRACTION_DLOPS_V1R2_HPP
#define DRIVER_CONTRACTION_DLOPS_V1R2_HPP
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
contraction_dlops_v1r2.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_contraction_dlops_v1r2.hpp"
template
<
ck
::
index_t
BlockSize
,
typename
FloatAB
,
...
...
@@ -45,7 +45,7 @@ template <ck::index_t BlockSize,
typename
AGridMoveSliceWindowIteratorHacks
,
typename
BGridMoveSliceWindowIteratorHacks
>
__host__
float
driver_
dynamic_
contraction_dlops_v1r2
(
const
FloatAB
*
p_a_grid
,
driver_contraction_dlops_v1r2
(
const
FloatAB
*
p_a_grid
,
const
FloatAB
*
p_b_grid
,
FloatC
*
p_c_grid
,
const
AGridDesc_GK0_GM0_GM1_GK1
&
a_grid_desc_gk0_gm0_gm1_gk1
,
...
...
@@ -70,7 +70,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid,
// GEMM
using
GridwiseContraction
=
Gridwise
Dynamic
ContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1
<
GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1
<
BlockSize
,
FloatAB
,
FloatAcc
,
...
...
@@ -116,7 +116,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid,
a_grid_desc_gk0_gm0_gm1_gk1
,
b_grid_desc_gk0_gn0_gn1_gk1
,
c_grid_desc_gm0_gm1_gn0_gn1
))
{
throw
std
::
runtime_error
(
"wrong! "
"Gridwise
Dynamic
Contraction_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_"
"GridwiseContraction_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_"
"GM0_GM1_GN0_GN1 has invalid setting"
);
}
...
...
@@ -178,7 +178,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid,
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
contraction_dlops_v1r2
<
const
auto
kernel
=
kernel_contraction_dlops_v1r2
<
GridwiseContraction
,
FloatAB
,
FloatC
,
...
...
@@ -204,7 +204,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid,
}
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
contraction_dlops_v1r2
<
const
auto
kernel
=
kernel_contraction_dlops_v1r2
<
GridwiseContraction
,
FloatAB
,
FloatC
,
...
...
@@ -230,7 +230,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid,
}
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
contraction_dlops_v1r2
<
const
auto
kernel
=
kernel_contraction_dlops_v1r2
<
GridwiseContraction
,
FloatAB
,
FloatC
,
...
...
@@ -256,7 +256,7 @@ driver_dynamic_contraction_dlops_v1r2(const FloatAB* p_a_grid,
}
else
{
const
auto
kernel
=
kernel_
dynamic_
contraction_dlops_v1r2
<
const
auto
kernel
=
kernel_contraction_dlops_v1r2
<
GridwiseContraction
,
FloatAB
,
FloatC
,
...
...
host/driver_offline/include/driver_
dynamic_
convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
→
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
View file @
c03045ce
#ifndef DRIVER_
DYNAMIC_
CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#define DRIVER_
DYNAMIC_
CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_NCHW_KCYX_NKHW_HPP
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
gemm_dlops_v2.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp"
#include "gridwise_operation_wrapper.hpp"
template
<
ck
::
index_t
BlockSize
,
...
...
@@ -34,9 +34,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
__host__
void
Run
(
const
ck
::
Dynamic
TensorDescriptor
<
Wei
...
>&
wei_k_c_y_x_global_desc
,
const
ck
::
Dynamic
TensorDescriptor
<
In
...
>&
in_n_c_hi_wi_global_desc
,
const
ck
::
Dynamic
TensorDescriptor
<
Out
...
>&
out_n_k0_ho_wo_k1_global_desc
,
__host__
void
Run
(
const
ck
::
TensorDescriptor
<
Wei
...
>&
wei_k_c_y_x_global_desc
,
const
ck
::
TensorDescriptor
<
In
...
>&
in_n_c_hi_wi_global_desc
,
const
ck
::
TensorDescriptor
<
Out
...
>&
out_n_k0_ho_wo_k1_global_desc
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
...
...
@@ -82,14 +82,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
const
auto
InRightPadW
=
in_right_pads
[
I1
];
// weight tensor
const
auto
wei_e_k_global_desc
=
transform_
dynamic_
tensor_descriptor
(
make_
dynamic_
naive_tensor_descriptor_packed
_v2
(
make_tuple
(
K
,
C
*
Y
*
X
)),
const
auto
wei_e_k_global_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C
*
Y
*
X
)),
make_tuple
(
make_pass_through_transform
(
K
),
make_pass_through_transform
(
C
*
Y
*
X
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}));
// input tensor
const
auto
in_n_c_hip_wip_global_desc
=
transform_
dynamic_
tensor_descriptor
(
const
auto
in_n_c_hip_wip_global_desc
=
transform_tensor_descriptor
(
in_n_c_hi_wi_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C
),
...
...
@@ -98,7 +98,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_
dynamic_
tensor_descriptor
(
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_tensor_descriptor
(
in_n_c_hip_wip_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
...
...
@@ -108,7 +108,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
const
auto
in_e_n_ho_wo_global_desc
=
transform_
dynamic_
tensor_descriptor
(
const
auto
in_e_n_ho_wo_global_desc
=
transform_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
C
,
Y
,
X
)),
make_pass_through_transform
(
N
),
...
...
@@ -118,8 +118,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
// output tensor
const
auto
out_k_n_ho_wo_global_desc
=
transform_
dynamic_
tensor_descriptor
(
make_
dynamic_
naive_tensor_descriptor_packed
_v2
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
)),
const
auto
out_k_n_ho_wo_global_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
K1
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
Ho
),
...
...
@@ -169,7 +169,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_pad
#if 1
// GEMM
using
gridwise_gemm
=
Gridwise
Dynamic
GemmDlops_km_kn_mn_v3
<
using
gridwise_gemm
=
GridwiseGemmDlops_km_kn_mn_v3
<
BlockSize
,
FloatAB
,
FloatAcc
,
...
...
host/driver_offline/include/driver_
dynamic_
convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
→
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw_outpad.hpp
View file @
c03045ce
#ifndef DRIVER_
DYNAMIC_
CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#define DRIVER_
DYNAMIC_
CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#ifndef DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#define DRIVER_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V5R1_DLOPS_NCHW_KCYX_NKHW_OUTPAD_HPP
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
gemm_dlops_v2.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp"
#include "gridwise_operation_wrapper.hpp"
template
<
ck
::
index_t
BlockSize
,
...
...
@@ -34,9 +34,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
typename
ConvDilations
,
typename
InLeftPads
,
typename
InRightPads
>
__host__
void
Run
(
const
ck
::
Dynamic
TensorDescriptor
<
Wei
...
>&
wei_k_c_y_x_global_desc
,
const
ck
::
Dynamic
TensorDescriptor
<
In
...
>&
in_n_c_hi_wi_global_desc
,
const
ck
::
Dynamic
TensorDescriptor
<
Out
...
>&
out_n_k0_ho_wo_k1_global_desc
,
__host__
void
Run
(
const
ck
::
TensorDescriptor
<
Wei
...
>&
wei_k_c_y_x_global_desc
,
const
ck
::
TensorDescriptor
<
In
...
>&
in_n_c_hi_wi_global_desc
,
const
ck
::
TensorDescriptor
<
Out
...
>&
out_n_k0_ho_wo_k1_global_desc
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
&
conv_dilations
,
const
InLeftPads
&
in_left_pads
,
...
...
@@ -93,14 +93,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
<<
std
::
endl
;
// weight tensor
const
auto
wei_e_k_global_desc
=
transform_
dynamic_
tensor_descriptor
(
make_
dynamic_
naive_tensor_descriptor_packed
_v2
(
make_tuple
(
K
,
C
*
Y
*
X
)),
const
auto
wei_e_k_global_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
K
,
C
*
Y
*
X
)),
make_tuple
(
make_pass_through_transform
(
K
),
make_pass_through_transform
(
C
*
Y
*
X
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
1
>
{},
Sequence
<
0
>
{}));
// input tensor
const
auto
in_n_c_hip_wip_global_desc
=
transform_
dynamic_
tensor_descriptor
(
const
auto
in_n_c_hip_wip_global_desc
=
transform_tensor_descriptor
(
in_n_c_hi_wi_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pass_through_transform
(
C
),
...
...
@@ -109,7 +109,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_
dynamic_
tensor_descriptor
(
const
auto
in_n_c_y_ho_x_wo_global_desc
=
transform_tensor_descriptor
(
in_n_c_hip_wip_global_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
...
...
@@ -119,7 +119,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
,
3
>
{},
Sequence
<
4
,
5
>
{}));
const
auto
in_e_n_ho_wo_global_desc
=
transform_
dynamic_
tensor_descriptor
(
const
auto
in_e_n_ho_wo_global_desc
=
transform_tensor_descriptor
(
in_n_c_y_ho_x_wo_global_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
C
,
Y
,
X
)),
make_pass_through_transform
(
N
),
...
...
@@ -129,8 +129,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
// output tensor
const
auto
out_k_n_hop_wop_global_desc
=
transform_
dynamic_
tensor_descriptor
(
make_
dynamic_
naive_tensor_descriptor_packed
_v2
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
)),
const
auto
out_k_n_hop_wop_global_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
K1
)),
make_pass_through_transform
(
N
),
make_pad_transform
(
Ho
,
0
,
OutRightPadH
),
...
...
@@ -181,7 +181,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}));
// GEMM
using
gridwise_gemm
=
Gridwise
Dynamic
GemmDlops_km_kn_mn_v3
<
using
gridwise_gemm
=
GridwiseGemmDlops_km_kn_mn_v3
<
BlockSize
,
FloatAB
,
FloatAcc
,
...
...
host/driver_offline/include/driver_
dynamic_
gemm_dlops_v1r2.hpp
→
host/driver_offline/include/driver_gemm_dlops_v1r2.hpp
View file @
c03045ce
#ifndef DRIVER_
DYNAMIC_
GEMM_DLOPS_V1R2
#define DRIVER_
DYNAMIC_
GEMM_DLOPS_V1R2
#ifndef DRIVER_GEMM_DLOPS_V1R2
#define DRIVER_GEMM_DLOPS_V1R2
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
gemm_dlops_v1r2.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v1r2.hpp"
template
<
ck
::
index_t
BlockSize
,
typename
FloatAB
,
...
...
@@ -48,7 +48,7 @@ template <ck::index_t BlockSize,
typename
CGridIteratorHacks
,
typename
AGridMoveSliceWindowIteratorHacks
,
typename
BGridMoveSliceWindowIteratorHacks
>
__host__
float
driver_
dynamic_
gemm_dlops_v1r2
(
const
FloatAB
*
p_a_grid
,
__host__
float
driver_gemm_dlops_v1r2
(
const
FloatAB
*
p_a_grid
,
const
FloatAB
*
p_b_grid
,
FloatC
*
p_c_grid
,
const
AKMGridDesc
&
a_k_m_grid_desc
,
...
...
@@ -72,8 +72,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
constexpr
auto
I5
=
Number
<
5
>
{};
// GEMM
using
GridwiseGemm
=
GridwiseDynamicGemmDlops_km_kn_mn_v1r2
<
BlockSize
,
using
GridwiseGemm
=
GridwiseGemmDlops_km_kn_mn_v1r2
<
BlockSize
,
FloatAB
,
FloatAcc
,
FloatC
,
...
...
@@ -122,8 +121,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
if
(
!
GridwiseGemm
::
CheckValidity
(
a_k_m_grid_desc
,
b_k_n_grid_desc
,
c_m_n_grid_desc
))
{
throw
std
::
runtime_error
(
"wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r2 has invalid setting"
);
throw
std
::
runtime_error
(
"wrong! GridwiseGemmDlops_km_kn_mn_v1r2 has invalid setting"
);
}
const
auto
a_k_m0_m1_grid_desc
=
GridwiseGemm
::
MakeAKM0M1GridDescriptor
(
a_k_m_grid_desc
);
...
...
@@ -174,7 +172,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r2
<
GridwiseGemm
,
kernel_gemm_dlops_v1r2
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AKM0M1GridDesc
>
,
...
...
@@ -200,7 +198,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r2
<
GridwiseGemm
,
kernel_gemm_dlops_v1r2
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AKM0M1GridDesc
>
,
...
...
@@ -226,7 +224,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r2
<
GridwiseGemm
,
kernel_gemm_dlops_v1r2
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AKM0M1GridDesc
>
,
...
...
@@ -252,7 +250,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r2
<
GridwiseGemm
,
kernel_gemm_dlops_v1r2
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AKM0M1GridDesc
>
,
...
...
@@ -295,7 +293,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r2
<
GridwiseGemm
,
kernel_gemm_dlops_v1r2
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AKM0M1GridDesc
>
,
...
...
@@ -324,7 +322,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r2
<
GridwiseGemm
,
kernel_gemm_dlops_v1r2
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AKM0M1GridDesc
>
,
...
...
@@ -353,7 +351,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r2
<
GridwiseGemm
,
kernel_gemm_dlops_v1r2
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AKM0M1GridDesc
>
,
...
...
@@ -382,7 +380,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r2
<
GridwiseGemm
,
kernel_gemm_dlops_v1r2
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AKM0M1GridDesc
>
,
...
...
host/driver_offline/include/driver_
dynamic_
gemm_dlops_v1r3.hpp
→
host/driver_offline/include/driver_gemm_dlops_v1r3.hpp
View file @
c03045ce
#ifndef DRIVER_
DYNAMIC_
GEMM_DLOPS_V1R3
#define DRIVER_
DYNAMIC_
GEMM_DLOPS_V1R3
#ifndef DRIVER_GEMM_DLOPS_V1R3
#define DRIVER_GEMM_DLOPS_V1R3
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
gemm_dlops_v1r3.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v1r3.hpp"
template
<
ck
::
index_t
BlockSize
,
typename
FloatAB
,
...
...
@@ -44,7 +44,7 @@ template <ck::index_t BlockSize,
typename
CGridIteratorHacks
,
typename
AGridMoveSliceWindowIteratorHacks
,
typename
BGridMoveSliceWindowIteratorHacks
>
__host__
float
driver_
dynamic_
gemm_dlops_v1r3
(
const
FloatAB
*
p_a_grid
,
__host__
float
driver_gemm_dlops_v1r3
(
const
FloatAB
*
p_a_grid
,
const
FloatAB
*
p_b_grid
,
FloatC
*
p_c_grid
,
const
AK0MK1GridDesc
&
a_k0_m_k1_grid_desc
,
...
...
@@ -69,7 +69,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
// GEMM
using
GridwiseGemm
=
Gridwise
Dynamic
GemmDlops_km_kn_mn_v1r3
<
BlockSize
,
GridwiseGemmDlops_km_kn_mn_v1r3
<
BlockSize
,
FloatAB
,
FloatAcc
,
FloatC
,
...
...
@@ -114,8 +114,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
if
(
!
GridwiseGemm
::
CheckValidity
(
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
))
{
throw
std
::
runtime_error
(
"wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r3 has invalid setting"
);
throw
std
::
runtime_error
(
"wrong! GridwiseGemmDlops_km_kn_mn_v1r3 has invalid setting"
);
}
const
auto
a_k0_m0_m1_k1_grid_desc
=
...
...
@@ -170,7 +169,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r3
<
GridwiseGemm
,
kernel_gemm_dlops_v1r3
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AK0M0M1K1GridDesc
>
,
...
...
@@ -196,7 +195,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r3
<
GridwiseGemm
,
kernel_gemm_dlops_v1r3
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AK0M0M1K1GridDesc
>
,
...
...
@@ -222,7 +221,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r3
<
GridwiseGemm
,
kernel_gemm_dlops_v1r3
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AK0M0M1K1GridDesc
>
,
...
...
@@ -248,7 +247,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r3
<
GridwiseGemm
,
kernel_gemm_dlops_v1r3
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AK0M0M1K1GridDesc
>
,
...
...
@@ -291,7 +290,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
if
(
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r3
<
GridwiseGemm
,
kernel_gemm_dlops_v1r3
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AK0M0M1K1GridDesc
>
,
...
...
@@ -322,7 +321,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else
if
(
has_main_k_block_loop
&&
!
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r3
<
GridwiseGemm
,
kernel_gemm_dlops_v1r3
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AK0M0M1K1GridDesc
>
,
...
...
@@ -353,7 +352,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else
if
(
!
has_main_k_block_loop
&&
has_double_tail_k_block_loop
)
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r3
<
GridwiseGemm
,
kernel_gemm_dlops_v1r3
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AK0M0M1K1GridDesc
>
,
...
...
@@ -384,7 +383,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else
{
const
auto
kernel
=
kernel_
dynamic_
gemm_dlops_v1r3
<
GridwiseGemm
,
kernel_gemm_dlops_v1r3
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AK0M0M1K1GridDesc
>
,
...
...
host/driver_offline/include/driver_
dynamic_
gemm_xdlops_v2r3.hpp
→
host/driver_offline/include/driver_gemm_xdlops_v2r3.hpp
View file @
c03045ce
#ifndef DRIVER_
DYNAMIC_
GEMM_XDLOPS_V2R3
#define DRIVER_
DYNAMIC_
GEMM_XDLOPS_V2R3
#ifndef DRIVER_GEMM_XDLOPS_V2R3
#define DRIVER_GEMM_XDLOPS_V2R3
#include "common_header.hpp"
#include "
dynamic_
tensor_descriptor.hpp"
#include "
dynamic_
tensor_descriptor_helper.hpp"
#include "gridwise_
dynamic_
gemm_xdlops_v2r3.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
template
<
ck
::
index_t
BlockSize
,
typename
FloatAB
,
...
...
@@ -47,7 +47,7 @@ template <ck::index_t BlockSize,
typename
AGridMoveSliceWindowIteratorHacks
,
typename
BGridMoveSliceWindowIteratorHacks
,
bool
CAccessOrderMRepeatNRepeat
>
__host__
float
driver_
dynamic_
gemm_xdlops_v2r3
(
const
FloatAB
*
p_a_grid
,
__host__
float
driver_gemm_xdlops_v2r3
(
const
FloatAB
*
p_a_grid
,
const
FloatAB
*
p_b_grid
,
FloatC
*
p_c_grid
,
const
AK0MK1GridDesc
&
a_k0_m_k1_grid_desc
,
...
...
@@ -68,7 +68,7 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
constexpr
auto
I2
=
Number
<
2
>
{};
using
GridwiseGemm
=
Gridwise
Dynamic
Gemm_k0mk1_k0nk1_mn_xdlops_v2r3
<
BlockSize
,
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
<
BlockSize
,
FloatAB
,
FloatAcc
,
FloatC
,
...
...
@@ -126,7 +126,7 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
if
(
!
GridwiseGemm
::
CheckValidity
(
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m_n_grid_desc
))
{
throw
std
::
runtime_error
(
"wrong! Gridwise
Dynamic
Gemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"
);
"wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"
);
}
const
auto
c_m0_m1_m2_n_grid_desc
=
GridwiseGemm
::
MakeCM0M1M2NGridDescriptor
(
c_m_n_grid_desc
);
...
...
@@ -139,7 +139,7 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
const
index_t
grid_size
=
GridwiseGemm
::
CalculateGridSize
(
c_m_n_grid_desc
);
const
auto
kernel
=
kernel_
dynamic_
gemm_xdlops_v2r3
<
GridwiseGemm
,
const
auto
kernel
=
kernel_gemm_xdlops_v2r3
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AK0MK1GridDesc
>
,
...
...
host/driver_offline/src/conv_bwd_driver_offline.cpp
View file @
c03045ce
...
...
@@ -12,10 +12,10 @@
#include "conv_common.hpp"
#include "host_conv_bwd_data.hpp"
#include "device_tensor.hpp"
#include "device_
dynamic_
convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_
dynamic_
convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_
DYNAMIC_
MODE 1
#define USE_MODE 1
#define USE_CONV_BWD_V4R1_XDL_NHWC 1
#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1
...
...
@@ -37,7 +37,7 @@ int main(int argc, char* argv[])
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
#if USE_
DYNAMIC_
MODE
#if USE_MODE
// dynamic mode
if
(
argc
!=
22
)
{
...
...
@@ -212,7 +212,7 @@ int main(int argc, char* argv[])
}
auto
f_make_for_device_nhwc
=
[
&
]()
{
#if USE_
DYNAMIC_
MODE
#if USE_MODE
const
auto
in_lengths_dev
=
make_tuple
(
N
,
Hi
,
Wi
,
C
);
const
auto
wei_lengths_dev
=
make_tuple
(
K
,
Y
,
X
,
C
);
const
auto
out_lengths_dev
=
make_tuple
(
N
,
Ho
,
Wo
,
K
);
...
...
@@ -253,10 +253,10 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nhwc
();
device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk
<
in_data_t
,
device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
...
...
@@ -280,10 +280,10 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nhwc
();
device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
<
in_data_t
,
device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
...
...
host/driver_offline/src/conv_fwd_driver_offline.cpp
View file @
c03045ce
...
...
@@ -12,14 +12,14 @@
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "device_
dynamic_
convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "device_
dynamic_
convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
#include "device_
dynamic_
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_
dynamic_
convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_
dynamic_
convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_
dynamic_
convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_
DYNAMIC_
MODE 1
#include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
#include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_MODE 1
#define USE_CONV_FWD_V4R4_NCHW 0
#define USE_CONV_FWD_V4R4R2_NHWC 1
#define USE_CONV_FWD_V6R1_NCHW 1
...
...
@@ -49,7 +49,7 @@ int main(int argc, char* argv[])
constexpr
auto
I5
=
Number
<
5
>
{};
constexpr
auto
I6
=
Number
<
6
>
{};
#if USE_
DYNAMIC_
MODE
#if USE_MODE
// dynamic mode
if
(
argc
!=
22
)
{
...
...
@@ -228,7 +228,7 @@ int main(int argc, char* argv[])
}
auto
f_make_for_device_nchw
=
[
&
]()
{
#if USE_
DYNAMIC_
MODE
#if USE_MODE
const
auto
in_lengths_dev
=
make_tuple
(
N
,
C
,
Hi
,
Wi
);
const
auto
wei_lengths_dev
=
make_tuple
(
K
,
C
,
Y
,
X
);
const
auto
out_lengths_dev
=
make_tuple
(
N
,
K
,
Ho
,
Wo
);
...
...
@@ -260,7 +260,7 @@ int main(int argc, char* argv[])
};
auto
f_make_for_device_nhwc
=
[
&
]()
{
#if USE_
DYNAMIC_
MODE
#if USE_MODE
const
auto
in_lengths_dev
=
make_tuple
(
N
,
Hi
,
Wi
,
C
);
const
auto
wei_lengths_dev
=
make_tuple
(
K
,
Y
,
X
,
C
);
const
auto
out_lengths_dev
=
make_tuple
(
N
,
Ho
,
Wo
,
K
);
...
...
@@ -301,10 +301,9 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nchw
();
device_
dynamic_
convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw
<
in_data_t
,
device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
...
...
@@ -328,10 +327,9 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nhwc
();
device_
dynamic_
convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk
<
in_data_t
,
device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
...
...
@@ -355,10 +353,9 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nchw
();
device_
dynamic_
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw
<
in_data_t
,
device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
...
...
@@ -382,11 +379,10 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nchw
();
device_
dynamic_
convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw
<
in_data_t
,
device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw
<
in_data_t
,
16
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
out_data_t
>
(
tmp
[
I0
],
tmp
[
I1
],
tmp
[
I2
],
tmp
[
I3
],
...
...
@@ -410,7 +406,7 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nchw
();
device_
dynamic_
convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw
<
in_data_t
,
device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
...
...
@@ -437,7 +433,7 @@ int main(int argc, char* argv[])
const
auto
tmp
=
f_make_for_device_nhwc
();
device_
dynamic_
convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk
<
in_data_t
,
device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk
<
in_data_t
,
acc_data_t
,
out_data_t
>
(
tmp
[
I0
],
...
...
host/host_tensor/include/conv_common.hpp
View file @
c03045ce
#ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP
#include "
dynamic_
tensor_descriptor.hpp"
#include "tensor_descriptor.hpp"
enum
ConvTensorLayout
{
...
...
@@ -19,8 +19,8 @@ template <typename... InDesc,
typename
LeftPads
,
typename
RightPads
>
constexpr
auto
get_convolution_output_default_4d_tensor_descriptor
(
const
ck
::
Dynamic
TensorDescriptor
<
InDesc
...
>&
in_desc
,
const
ck
::
Dynamic
TensorDescriptor
<
WeiDesc
...
>&
wei_desc
,
const
ck
::
TensorDescriptor
<
InDesc
...
>&
in_desc
,
const
ck
::
TensorDescriptor
<
WeiDesc
...
>&
wei_desc
,
const
ConvStrides
&
conv_strides
,
const
ConvDilations
conv_dilations
,
const
LeftPads
&
left_pads
,
...
...
@@ -57,7 +57,7 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(
const
auto
Ho
=
(
Hi
+
LeftPadH
+
RightPadH
-
YEff
)
/
conv_strides
[
I0
]
+
I1
;
const
auto
Wo
=
(
Wi
+
LeftPadW
+
RightPadW
-
XEff
)
/
conv_strides
[
I1
]
+
I1
;
return
make_
dynamic_
naive_tensor_descriptor_packed
_v2
(
make_tuple
(
N
,
K
,
Ho
,
Wo
));
return
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K
,
Ho
,
Wo
));
}
template
<
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment