Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
4eb9a7a4
"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "f2c76e41f668b30cc6714ed8a866c07cc2f2275a"
Commit
4eb9a7a4
authored
Oct 14, 2021
by
Jing Zhang
Browse files
enable static desc
parent
a69937d3
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
142 additions
and
63 deletions
+142
-63
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
...l/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
+65
-11
composable_kernel/include/utility/config.hpp
composable_kernel/include/utility/config.hpp
+2
-1
host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+75
-51
No files found.
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
View file @
4eb9a7a4
...
@@ -119,6 +119,52 @@ __global__ void
...
@@ -119,6 +119,52 @@ __global__ void
c_blockid_to_k_n_h_w_block_cluster_adaptor
,
c_blockid_to_k_n_h_w_block_cluster_adaptor
,
integral_constant
<
bool
,
HasMainE0BlockLoop
>
{});
integral_constant
<
bool
,
HasMainE0BlockLoop
>
{});
}
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
template
<
typename
GridwiseGemm
,
typename
FloatAB
,
typename
FloatC
,
typename
AGridDesc_E0_E1_K0_K1_E2
,
typename
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
,
typename
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
,
typename
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
,
typename
CBlockIdToBlockClusterAdaptor_K_N_H_W
,
bool
HasMainE0BlockLoop
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
kernel_gemm_dlops_v2_add
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatC
*
__restrict__
p_bias_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatC
*
__restrict__
p_d_grid
)
{
constexpr
index_t
shared_block_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
FloatAB
);
__shared__
FloatAB
p_shared_block
[
shared_block_size
];
constexpr
auto
a_e0_e1_k0_k1_e2_grid_desc
=
AGridDesc_E0_E1_K0_K1_E2
{};
constexpr
auto
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
=
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
{};
constexpr
auto
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
=
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
{};
constexpr
auto
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
=
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
{};
constexpr
auto
c_blockid_to_k_n_h_w_block_cluster_adaptor
=
CBlockIdToBlockClusterAdaptor_K_N_H_W
{};
GridwiseGemm
::
Run
(
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
,
p_shared_block
,
a_e0_e1_k0_k1_e2_grid_desc
,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
,
c_blockid_to_k_n_h_w_block_cluster_adaptor
,
integral_constant
<
bool
,
HasMainE0BlockLoop
>
{});
}
#endif
#endif
template
<
index_t
BlockSize
,
template
<
index_t
BlockSize
,
...
@@ -339,13 +385,23 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
...
@@ -339,13 +385,23 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
const
auto
K1
=
Number
<
KPerBlock
>
{};
const
auto
K1
=
Number
<
KPerBlock
>
{};
const
auto
K0
=
K
/
K1
;
const
auto
K0
=
K
/
K1
;
const
auto
H2
=
HoPerThread
/
2
;
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const
auto
H2
=
Number
<
HoPerThread
/
2
>
{};
const
auto
H1
=
Number
<
HoPerBlock
/
HoPerThread
>
{};
const
auto
H1
=
Number
<
HoPerBlock
/
HoPerThread
>
{};
const
auto
H0
=
Number
<
Hx
/
(
H1
*
H2
)
>
{};
const
auto
W2
=
Number
<
WoPerThread
/
2
>
{};
const
auto
W1
=
Number
<
WoPerBlock
/
WoPerThread
>
{};
const
auto
W0
=
Number
<
Wx
/
(
W1
*
W2
)
>
{};
#else
const
auto
H2
=
HoPerThread
/
2
;
const
auto
H1
=
HoPerBlock
/
HoPerThread
;
const
auto
H0
=
Hx
/
(
H1
*
H2
);
const
auto
H0
=
Hx
/
(
H1
*
H2
);
const
auto
W2
=
WoPerThread
/
2
;
const
auto
W2
=
WoPerThread
/
2
;
const
auto
W1
=
Number
<
WoPerBlock
/
WoPerThread
>
{}
;
const
auto
W1
=
WoPerBlock
/
WoPerThread
;
const
auto
W0
=
Wx
/
(
W1
*
W2
);
const
auto
W0
=
Wx
/
(
W1
*
W2
);
#endif
const
auto
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
=
transform_tensor_descriptor
(
const
auto
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
=
transform_tensor_descriptor
(
d_k_n_hx_wx_grid_desc
,
d_k_n_hx_wx_grid_desc
,
...
@@ -415,10 +471,17 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
...
@@ -415,10 +471,17 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
const
auto
Ho
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I2
);
const
auto
Ho
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I2
);
const
auto
Wo
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I3
);
const
auto
Wo
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I3
);
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const
auto
K0
=
Number
<
K
/
KPerBlock
>
{};
const
auto
N0
=
Number
<
N
/
NPerBlock
>
{};
const
auto
H0
=
Number
<
Ho
/
HoPerBlock
>
{};
const
auto
W0
=
Number
<
Wo
/
WoPerBlock
>
{};
#else
const
auto
K0
=
K
/
KPerBlock
;
const
auto
K0
=
K
/
KPerBlock
;
const
auto
N0
=
N
/
NPerBlock
;
const
auto
N0
=
N
/
NPerBlock
;
const
auto
H0
=
Ho
/
HoPerBlock
;
const
auto
H0
=
Ho
/
HoPerBlock
;
const
auto
W0
=
Wo
/
WoPerBlock
;
const
auto
W0
=
Wo
/
WoPerBlock
;
#endif
const
auto
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
const
auto
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
N0
,
H0
,
W0
))),
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
N0
,
H0
,
W0
))),
...
@@ -464,15 +527,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
...
@@ -464,15 +527,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
const
CBlockIdToBlockClusterAdaptor_K_N_H_W
&
c_blockid_to_k_n_h_w_block_cluster_adaptor
,
const
CBlockIdToBlockClusterAdaptor_K_N_H_W
&
c_blockid_to_k_n_h_w_block_cluster_adaptor
,
integral_constant
<
bool
,
HasMainE0BlockLoop
>
)
integral_constant
<
bool
,
HasMainE0BlockLoop
>
)
{
{
// constexpr auto a_e0_e1_k0_k1_e2_grid_desc = AGridDesc_E0_E1_K0_K1_E2{};
// constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc =
// BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2{};
// constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc =
// DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx{};
// constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
// CBlockIdToBlockClusterAdaptor_K_N_H_W{};
const
auto
bias_k0_k1_grid_desc
=
const
auto
bias_k0_k1_grid_desc
=
MakeBiasK0K1GridDescriptor
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
);
MakeBiasK0K1GridDescriptor
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
);
...
...
composable_kernel/include/utility/config.hpp
View file @
4eb9a7a4
...
@@ -90,8 +90,9 @@
...
@@ -90,8 +90,9 @@
#endif
#endif
// pass tensor descriptor by value or void*
// pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
1
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 1
// merge transformation use magic number division
// merge transformation use magic number division
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0
...
...
host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
View file @
4eb9a7a4
...
@@ -183,12 +183,12 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -183,12 +183,12 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
// max tensor
// max tensor
const
auto
d_k_n_h
opx2_wopx2
_grid_desc
=
transform_tensor_descriptor
(
const
auto
d_k_n_h
x_wx
_grid_desc
=
transform_tensor_descriptor
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Hx
,
Wx
,
K1
)),
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Hx
,
Wx
,
K1
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
K1
)),
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
K1
)),
make_pass_through_transform
(
N
),
make_pass_through_transform
(
N
),
make_pad_transform
(
Hx
,
I0
,
Number
<
OutRightPadH
/
2
>
{}),
make_pad_transform
(
Hx
,
I0
,
Number
<
OutRightPadH
/
2
>
{}),
make_pad_transform
(
Wx
,
I0
,
Number
<
OutRightPadW
/
2
>
{})),
make_pad_transform
(
Wx
,
I0
,
Number
<
OutRightPadW
/
2
>
{})),
make_tuple
(
Sequence
<
1
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
1
,
4
>
{},
Sequence
<
0
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
...
@@ -289,10 +289,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -289,10 +289,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
>
{}));
// clang-format on
// clang-format on
static_assert
(
a_e0_e1_k_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
b_e0_e1_n_ho_wo_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
d_k_n_hopx2_wopx2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_k_n_hop_wop_grid_desc
.
IsKnownAtCompileTime
(),
""
);
// GEMM
// GEMM
using
GridwiseGemm
=
GridwiseGemmDlops_km_kn_mn_v3_add
<
using
GridwiseGemm
=
GridwiseGemmDlops_km_kn_mn_v3_add
<
...
@@ -304,7 +300,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -304,7 +300,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
decltype
(
a_e0_e1_k_e2_grid_desc
),
decltype
(
a_e0_e1_k_e2_grid_desc
),
decltype
(
b_e0_e1_n_ho_wo_e2_grid_desc
),
decltype
(
b_e0_e1_n_ho_wo_e2_grid_desc
),
decltype
(
c_k_n_hop_wop_grid_desc
),
decltype
(
c_k_n_hop_wop_grid_desc
),
decltype
(
d_k_n_h
opx2_wopx2
_grid_desc
),
decltype
(
d_k_n_h
x_wx
_grid_desc
),
E1
,
E1
,
E2
,
E2
,
K2
,
K2
,
...
@@ -348,14 +344,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -348,14 +344,13 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
const
auto
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
=
const
auto
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
=
GridwiseGemm
::
MakeCK0K1NH0H1H2W0W1W2GridDescriptor
(
c_k_n_hop_wop_grid_desc
);
GridwiseGemm
::
MakeCK0K1NH0H1H2W0W1W2GridDescriptor
(
c_k_n_hop_wop_grid_desc
);
const
auto
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
=
const
auto
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
=
GridwiseGemm
::
MakeDK0K1NH0H1HxW0W1WxGridDescriptor
(
d_k_n_h
opx2_wopx2
_grid_desc
);
GridwiseGemm
::
MakeDK0K1NH0H1HxW0W1WxGridDescriptor
(
d_k_n_h
x_wx
_grid_desc
);
using
AGridDesc_E0_E1_K0_K1_E2
=
decltype
(
a_e0_e1_k0_k1_e2_grid_desc
);
using
AGridDesc_E0_E1_K0_K1_E2
=
decltype
(
a_e0_e1_k0_k1_e2_grid_desc
);
using
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
=
using
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
=
decltype
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
);
decltype
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
);
using
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
=
decltype
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
);
using
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
=
decltype
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
);
using
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
=
using
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
=
decltype
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
);
decltype
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
);
const
auto
grid_size
=
(
K
/
KPerBlock
)
*
(
Hop
/
HoPerBlock
)
*
(
Wop
/
WoPerBlock
)
*
N
;
const
auto
grid_size
=
(
K
/
KPerBlock
)
*
(
Hop
/
HoPerBlock
)
*
(
Wop
/
WoPerBlock
)
*
N
;
...
@@ -375,16 +370,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -375,16 +370,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
if
(
has_main_e0_block_loop
)
if
(
has_main_e0_block_loop
)
{
{
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
const
auto
kernel
=
GridwiseGemm
,
kernel_gemm_dlops_v2_add
<
GridwiseGemm
,
FloatAB
,
FloatAB
,
FloatC
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
true
>
;
true
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
nrepeat
,
...
@@ -404,16 +399,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -404,16 +399,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
}
}
else
else
{
{
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
const
auto
kernel
=
GridwiseGemm
,
kernel_gemm_dlops_v2_add
<
GridwiseGemm
,
FloatAB
,
FloatAB
,
FloatC
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
false
>
;
false
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
nrepeat
,
...
@@ -456,16 +451,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -456,16 +451,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
if
(
has_main_e0_block_loop
)
if
(
has_main_e0_block_loop
)
{
{
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
const
auto
kernel
=
GridwiseGemm
,
kernel_gemm_dlops_v2_add
<
GridwiseGemm
,
FloatAB
,
FloatAB
,
FloatC
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
true
>
;
true
>
;
ave_time
=
launch_and_time_kernel
(
ave_time
=
launch_and_time_kernel
(
kernel
,
kernel
,
...
@@ -492,16 +487,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -492,16 +487,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
else
else
{
{
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
const
auto
kernel
=
GridwiseGemm
,
kernel_gemm_dlops_v2_add
<
GridwiseGemm
,
FloatAB
,
FloatAB
,
FloatC
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
false
>
;
false
>
;
ave_time
=
launch_and_time_kernel
(
ave_time
=
launch_and_time_kernel
(
kernel
,
kernel
,
...
@@ -525,7 +520,36 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
...
@@ -525,7 +520,36 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
cast_pointer_to_constant_address_space
(
cast_pointer_to_constant_address_space
(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
}
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert
(
a_e0_e1_k_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_blockid_to_k_n_h_w_block_cluster_adaptor
.
IsKnownAtCompileTime
(),
""
);
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
has_main_e0_block_loop
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
);
}
#endif
#endif
return
ave_time
;
return
ave_time
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment