Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
26c42b94
Commit
26c42b94
authored
Oct 15, 2021
by
Jing Zhang
Browse files
conv-only use v5r1_add
parent
4eb9a7a4
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
194 additions
and
56 deletions
+194
-56
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
...l/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
+106
-17
host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+0
-6
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+0
-4
host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+37
-11
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+35
-3
host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
...ward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
+16
-13
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
.../driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
+0
-2
No files found.
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2_add.hpp
View file @
26c42b94
...
...
@@ -133,11 +133,56 @@ __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
kernel_gemm_dlops_v2_add
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatC
*
__restrict__
p_bias_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatC
*
__restrict__
p_d_grid
)
kernel_gemm_dlops_v2_resize_add
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatC
*
__restrict__
p_bias_grid
,
FloatC
*
__restrict__
p_d_grid
)
{
constexpr
index_t
shared_block_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
FloatAB
);
__shared__
FloatAB
p_shared_block
[
shared_block_size
];
constexpr
auto
a_e0_e1_k0_k1_e2_grid_desc
=
AGridDesc_E0_E1_K0_K1_E2
{};
constexpr
auto
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
=
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
{};
constexpr
auto
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
=
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
{};
constexpr
auto
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
=
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
{};
constexpr
auto
c_blockid_to_k_n_h_w_block_cluster_adaptor
=
CBlockIdToBlockClusterAdaptor_K_N_H_W
{};
GridwiseGemm
::
Run
(
p_a_grid
,
p_b_grid
,
p_bias_grid
,
nullptr
,
p_d_grid
,
p_shared_block
,
a_e0_e1_k0_k1_e2_grid_desc
,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
,
c_blockid_to_k_n_h_w_block_cluster_adaptor
,
integral_constant
<
bool
,
HasMainE0BlockLoop
>
{});
}
template
<
typename
GridwiseGemm
,
typename
FloatAB
,
typename
FloatC
,
typename
AGridDesc_E0_E1_K0_K1_E2
,
typename
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
,
typename
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
,
typename
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
,
typename
CBlockIdToBlockClusterAdaptor_K_N_H_W
,
bool
HasMainE0BlockLoop
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
kernel_gemm_dlops_v2_maxpool
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatC
*
__restrict__
p_bias_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatC
*
__restrict__
p_d_grid
)
{
constexpr
index_t
shared_block_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
FloatAB
);
...
...
@@ -165,6 +210,49 @@ __global__ void
c_blockid_to_k_n_h_w_block_cluster_adaptor
,
integral_constant
<
bool
,
HasMainE0BlockLoop
>
{});
}
template
<
typename
GridwiseGemm
,
typename
FloatAB
,
typename
FloatC
,
typename
AGridDesc_E0_E1_K0_K1_E2
,
typename
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
,
typename
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
,
typename
CBlockIdToBlockClusterAdaptor_K_N_H_W
,
bool
HasMainE0BlockLoop
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
kernel_gemm_dlops_v2
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatC
*
__restrict__
p_bias_grid
,
FloatC
*
__restrict__
p_c_grid
)
{
constexpr
index_t
shared_block_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
FloatAB
);
__shared__
FloatAB
p_shared_block
[
shared_block_size
];
constexpr
auto
a_e0_e1_k0_k1_e2_grid_desc
=
AGridDesc_E0_E1_K0_K1_E2
{};
constexpr
auto
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
=
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
{};
constexpr
auto
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
=
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
{};
constexpr
auto
c_blockid_to_k_n_h_w_block_cluster_adaptor
=
CBlockIdToBlockClusterAdaptor_K_N_H_W
{};
GridwiseGemm
::
Run
(
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
nullptr
,
p_shared_block
,
a_e0_e1_k0_k1_e2_grid_desc
,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
,
c_blockid_to_k_n_h_w_block_cluster_adaptor
,
integral_constant
<
bool
,
HasMainE0BlockLoop
>
{});
}
#endif
template
<
index_t
BlockSize
,
...
...
@@ -209,6 +297,8 @@ template <index_t BlockSize,
typename
AGlobalMoveSliceWindowStepHacks
,
typename
BGlobalMoveSliceWindowStepHacks
,
index_t
activ_type
=
0
,
index_t
bias_type
=
0
,
index_t
out_type
=
1
,
index_t
add_type
=
0
>
struct
GridwiseGemmDlops_km_kn_mn_v3_add
{
...
...
@@ -426,13 +516,13 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
const
auto
K1
=
Number
<
KPerBlock
>
{};
const
auto
K0
=
K
/
K1
;
const
auto
H2
=
HoPerThread
*
2
;
const
auto
H2
=
Number
<
HoPerThread
*
2
>
{}
;
const
auto
H1
=
Number
<
HoPerBlock
/
HoPerThread
>
{};
const
auto
H0
=
Hx
/
(
H1
*
H2
);
const
auto
H0
=
Number
<
Hx
/
(
H1
*
H2
)
>
{}
;
const
auto
W2
=
WoPerThread
*
2
;
const
auto
W2
=
Number
<
WoPerThread
*
2
>
{}
;
const
auto
W1
=
Number
<
WoPerBlock
/
WoPerThread
>
{};
const
auto
W0
=
Wx
/
(
W1
*
W2
);
const
auto
W0
=
Number
<
Wx
/
(
W1
*
W2
)
>
{}
;
const
auto
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc
=
transform_tensor_descriptor
(
d_k_n_hx_wx_grid_desc
,
...
...
@@ -449,11 +539,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
__host__
__device__
static
constexpr
auto
MakeDK0K1NH0H1HxW0W1WxGridDescriptor
(
const
DGridDesc_K_N_Hx_Wx
&
d_k_n_hx_wx_grid_desc
)
{
if
constexpr
(
add_type
==
0
)
if
constexpr
(
add_type
==
1
)
{
return
MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd
(
d_k_n_hx_wx_grid_desc
);
}
else
if
constexpr
(
add_type
==
1
)
else
if
constexpr
(
add_type
==
2
)
{
return
MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool
(
d_k_n_hx_wx_grid_desc
);
}
...
...
@@ -544,9 +634,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
constexpr
auto
HasMainE1BlockLoop
=
CalculateHasMainE1BlockLoop
();
constexpr
auto
HasDoubleTailE1BlockLoop
=
CalculateHasDoubleTailE1BlockLoop
();
// const auto Ho = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I3);
// const auto Wo = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I4);
const
auto
c_k_n_h_w_block_cluster_idx
=
c_blockid_to_k_n_h_w_block_cluster_adaptor
.
CalculateBottomIndex
(
make_multi_index
(
get_block_1d_id
()));
...
...
@@ -909,6 +996,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
}
// activ
if
constexpr
(
activ_type
>
0
)
{
static_for
<
0
,
c_k1_n_h2_w2_thread_gemm_desc
.
GetElementSpaceSize
(),
1
>
{}([
&
](
auto
i
)
{
if
constexpr
(
activ_type
==
1
)
...
...
@@ -929,7 +1017,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
});
}
// Bias
if
constexpr
(
bias_type
==
1
)
{
constexpr
auto
bias_k0_k1_thread_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
Number
<
KPerThread
>
{}));
...
...
@@ -982,6 +1070,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
#if 1
// Output
if
constexpr
(
out_type
==
1
)
{
// hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global
// tensor
...
...
@@ -1032,7 +1121,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
#if 1
// Resize_Add
if
constexpr
(
add_type
==
0
)
if
constexpr
(
add_type
==
1
)
{
constexpr
auto
HoPerThreadx2
=
HoPerThread
*
2
;
constexpr
auto
WoPerThreadx2
=
WoPerThread
*
2
;
...
...
@@ -1145,7 +1234,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3_add
d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks
);
}
// MaxPool
else
if
constexpr
(
add_type
==
1
)
else
if
constexpr
(
add_type
==
2
)
{
static_assert
(
HoPerThread
%
2
==
0
&&
WoPerThread
%
2
==
0
,
""
);
...
...
host/driver_offline/include/device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
View file @
26c42b94
...
...
@@ -27,7 +27,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
const
Tensor
<
TInWei
>&
in_n_c0_hi_wi_c1
,
const
Tensor
<
TInWei
>&
wei_k_c0_y_x_c1
,
const
Tensor
<
TOut
>&
bias_k0_k1
,
Tensor
<
TOut
>&
out_n_k0_ho_wo_k1
,
const
Tensor
<
TOut
>&
add_n_k0_hox2_wox2_k1
,
Tensor
<
TOut
>&
add_n_k0_hox2_wox2_k1_out
,
ck
::
index_t
nrepeat
)
...
...
@@ -64,8 +63,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
in_n_c0_hi_wi_c1
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c0_y_x_c1_device_buf
(
sizeof
(
TInWei
)
*
wei_k_c0_y_x_c1
.
mDesc
.
GetElementSpace
());
DeviceMem
bias_k0_k1_device_buf
(
sizeof
(
TOut
)
*
bias_k0_k1
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_k0_ho_wo_k1_device_buf
(
sizeof
(
TOut
)
*
out_n_k0_ho_wo_k1
.
mDesc
.
GetElementSpace
());
DeviceMem
add_n_k0_hox2_wox2_k1_device_buf
(
sizeof
(
TOut
)
*
add_n_k0_hox2_wox2_k1
.
mDesc
.
GetElementSpace
());
...
...
@@ -191,7 +188,6 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
in_n_c0_hi_wi_c1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
bias_k0_k1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
out_n_k0_ho_wo_k1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
add_n_k0_hox2_wox2_k1_device_buf
.
GetDeviceBuffer
()),
nrepeat
);
...
...
@@ -219,10 +215,8 @@ void device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
in_n_c0_hi_wi_c1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
bias_k0_k1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
out_n_k0_ho_wo_k1_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
add_n_k0_hox2_wox2_k1_device_buf
.
GetDeviceBuffer
()),
0
);
out_n_k0_ho_wo_k1_device_buf
.
FromDevice
(
out_n_k0_ho_wo_k1
.
mData
.
data
());
add_n_k0_hox2_wox2_k1_device_buf
.
FromDevice
(
add_n_k0_hox2_wox2_k1_out
.
mData
.
data
());
}
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
View file @
26c42b94
...
...
@@ -132,10 +132,6 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1
const
auto
out_n_k0_ho_wo_k1_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
K0
,
Ho
,
Wo
,
K1
));
static_assert
(
in_n_c0_hi_wi_c1_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
wei_k_c0_y_x_c1_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
out_n_k0_ho_wo_k1_desc
.
IsKnownAtCompileTime
(),
""
);
constexpr
auto
conv_driver
=
DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_outpad
<
BlockSize
,
...
...
host/driver_offline/include/driver_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
View file @
26c42b94
...
...
@@ -49,7 +49,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatC
*
__restrict__
p_bias_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatC
*
__restrict__
p_d_grid
,
const
int
nrepeat
)
const
{
...
...
@@ -338,7 +337,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
decltype
(
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_global_tensor_step_hacks
),
decltype
(
a_e0_e1_k_e2_global_move_slice_window_step_hack
),
decltype
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack
),
activ_type
>
;
activ_type
,
1
,
// bias_type
0
,
// out_type
1
// add_type
>
;
const
auto
a_e0_e1_k0_k1_e2_grid_desc
=
GridwiseGemm
::
MakeAE0E1K0K1E2GridDescriptor
(
a_e0_e1_k_e2_grid_desc
);
...
...
@@ -374,7 +377,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
if
(
has_main_e0_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
const
auto
kernel
=
kernel_gemm_dlops_v2_
resize_
add
<
GridwiseGemm
,
FloatAB
,
FloatC
,
...
...
@@ -393,7 +396,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
,
a_e0_e1_k0_k1_e2_grid_desc
,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
,
...
...
@@ -403,7 +405,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
}
else
{
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
const
auto
kernel
=
kernel_gemm_dlops_v2_
resize_
add
<
GridwiseGemm
,
FloatAB
,
FloatC
,
...
...
@@ -422,7 +424,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
,
a_e0_e1_k0_k1_e2_grid_desc
,
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
,
...
...
@@ -455,7 +456,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
if
(
has_main_e0_block_loop
)
{
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
const
auto
kernel
=
kernel_gemm_dlops_v2_
resize_
add
<
GridwiseGemm
,
FloatAB
,
FloatC
,
...
...
@@ -475,7 +476,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
,
cast_pointer_to_constant_address_space
(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf
.
GetDeviceBuffer
()),
...
...
@@ -490,8 +490,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
}
else
{
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
const
auto
kernel
=
kernel_gemm_dlops_v2_resize_add
<
GridwiseGemm
,
FloatAB
,
FloatC
,
...
...
@@ -511,7 +510,6 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
,
p_d_grid
,
cast_pointer_to_constant_address_space
(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf
.
GetDeviceBuffer
()),
...
...
@@ -524,7 +522,35 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
cast_pointer_to_constant_address_space
(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert
(
a_e0_e1_k_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
d_k0_k1_n_h0_h1_h2x2_w0_w1_w2x2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_blockid_to_k_n_h_w_block_cluster_adaptor
.
IsKnownAtCompileTime
(),
""
);
const
auto
kernel
=
kernel_gemm_dlops_v2_resize_add
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
has_main_e0_block_loop
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_d_grid
);
}
#endif
return
ave_time
;
}
...
...
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
View file @
26c42b94
...
...
@@ -4,7 +4,7 @@
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v2.hpp"
#include "gridwise_gemm_dlops_v2
_add
.hpp"
template
<
ck
::
index_t
BlockSize
,
typename
FloatAB
,
...
...
@@ -259,7 +259,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
static_assert
(
c_k_n_hop_wop_grid_desc
.
IsKnownAtCompileTime
(),
""
);
// GEMM
using
GridwiseGemm
=
GridwiseGemmDlops_km_kn_mn_v3
<
using
GridwiseGemm
=
GridwiseGemmDlops_km_kn_mn_v3
_add
<
BlockSize
,
FloatAB
,
FloatAcc
,
...
...
@@ -268,6 +268,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
decltype
(
a_e0_e1_k_e2_grid_desc
),
decltype
(
b_e0_e1_n_ho_wo_e2_grid_desc
),
decltype
(
c_k_n_hop_wop_grid_desc
),
decltype
(
c_k_n_hop_wop_grid_desc
),
E1
,
E2
,
K2
,
...
...
@@ -298,9 +299,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
decltype
(
a_e0_e1_k_e2_global_step_hacks
),
decltype
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks
),
decltype
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks
),
decltype
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks
),
decltype
(
a_e0_e1_k_e2_global_move_slice_window_step_hack
),
decltype
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack
),
activ_type
>
;
activ_type
,
0
,
// bias_type
1
,
// out_type
0
// add_type
>
;
const
auto
a_e0_e1_k0_k1_e2_grid_desc
=
GridwiseGemm
::
MakeAE0E1K0K1E2GridDescriptor
(
a_e0_e1_k_e2_grid_desc
);
...
...
@@ -469,7 +475,33 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
cast_pointer_to_constant_address_space
(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf
.
GetDeviceBuffer
()));
}
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert
(
a_e0_e1_k_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_blockid_to_k_n_h_w_block_cluster_adaptor
.
IsKnownAtCompileTime
(),
""
);
const
auto
kernel
=
kernel_gemm_dlops_v2
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
has_main_e0_block_loop
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
p_a_grid
,
p_b_grid
,
p_bias_grid
,
p_c_grid
);
}
#endif
return
ave_time
;
}
...
...
host/driver_offline/include/driver_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp
View file @
26c42b94
...
...
@@ -323,8 +323,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
Sequence
<
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
>
,
// E0, E1, N, H0, H1, H2, W0, W1, W2, E2
9
,
BThreadTransferSrcScalarPerVector_E2
,
false
,
// don't move back src coordinate after threadwise copy, which will be fused
with
// MoveSrcSliceWindow() to save addr computation
false
,
// don't move back src coordinate after threadwise copy, which will be fused
//
with
MoveSrcSliceWindow() to save addr computation
Sequence
<
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
>
,
// K0, K1, N, H0, H1, I2, H2, W0, W1, I2, W2
1
,
CThreadTransferDstScalarPerVector_K
,
...
...
@@ -335,7 +335,10 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
decltype
(
a_e0_e1_k_e2_global_move_slice_window_step_hack
),
decltype
(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_move_slice_window_step_hack
),
activ_type
,
1
>
;
1
,
// bias_type
1
,
// out_type
2
// add_type
>
;
const
auto
a_e0_e1_k0_k1_e2_grid_desc
=
GridwiseGemm
::
MakeAE0E1K0K1E2GridDescriptor
(
a_e0_e1_k_e2_grid_desc
);
...
...
@@ -528,16 +531,16 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
static_assert
(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc
.
IsKnownAtCompileTime
(),
""
);
static_assert
(
c_blockid_to_k_n_h_w_block_cluster_adaptor
.
IsKnownAtCompileTime
(),
""
);
const
auto
kernel
=
kernel_gemm_dlops_v2_add
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
has_main_e0_block_loop
>
;
const
auto
kernel
=
kernel_gemm_dlops_v2_maxpool
<
GridwiseGemm
,
FloatAB
,
FloatC
,
remove_reference_t
<
AGridDesc_E0_E1_K0_K1_E2
>
,
remove_reference_t
<
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
>
,
remove_reference_t
<
CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2
>
,
remove_reference_t
<
DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx
>
,
remove_reference_t
<
CBlockIdToBlockClusterAdaptor_K_N_H_W
>
,
has_main_e0_block_loop
>
;
ave_time
=
launch_and_time_kernel
(
kernel
,
nrepeat
,
...
...
host/driver_offline/src/conv_add_fwd_driver_offline_nchwc.cpp
View file @
26c42b94
...
...
@@ -214,7 +214,6 @@ int main(int argc, char* argv[])
Tensor
<
in_data_t
>
in
(
in_lengths_host
);
Tensor
<
in_data_t
>
wei
(
wei_lengths_host
);
Tensor
<
in_data_t
>
add
(
add_lengths_host
);
Tensor
<
out_data_t
>
out_device
(
out_lengths_host
);
Tensor
<
in_data_t
>
add_device
(
add_lengths_host
);
Tensor
<
in_data_t
>
add_host
(
add_lengths_host
);
Tensor
<
out_data_t
>
bias
(
bias_lengths_host
);
...
...
@@ -308,7 +307,6 @@ int main(int argc, char* argv[])
in
,
wei
,
bias
,
out_device
,
add
,
add_device
,
nrepeat
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment