Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
20e6bc9d
"...composable_kernel_rocm.git" did not exist on "6014185ac65e75f2a84cb67ef6ba83b48ae0fcb3"
Commit
20e6bc9d
authored
Oct 04, 2021
by
Jing Zhang
Browse files
clean code
parent
8f3c4d86
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
107 additions
and
612 deletions
+107
-612
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
...ernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
+16
-14
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
...el/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
+1
-1
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+2
-0
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
...ution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
+87
-596
host/driver_offline/src/conv_fwd_driver_offline.cpp
host/driver_offline/src/conv_fwd_driver_offline.cpp
+1
-1
No files found.
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp
View file @
20e6bc9d
...
@@ -28,7 +28,7 @@ __global__ void
...
@@ -28,7 +28,7 @@ __global__ void
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
FloatC
*
__restrict__
p_c_grid
,
FloatC
*
__restrict__
p_c_grid
,
const
AGridDesc_E0_E1_K0_K1_E2
A_E
0_
E
1_
K
0_
K
1_
E
2_grid_desc
,
const
AGridDesc_E0_E1_K0_K1_E2
a_e
0_
e
1_
k
0_
k
1_
e
2_grid_desc
,
const
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
,
const
BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc
,
const
CGridDesc_K_N_H0_H1_H2_W0_W1_W2
c_k_n_h0_h1_h2_w0_w1_w2_grid_desc
,
const
CGridDesc_K_N_H0_H1_H2_W0_W1_W2
c_k_n_h0_h1_h2_w0_w1_w2_grid_desc
,
const
CBlockIdToBlockClusterAdaptor_K_N_H_W
c_blockid_to_k_n_h_w_block_cluster_adaptor
)
const
CBlockIdToBlockClusterAdaptor_K_N_H_W
c_blockid_to_k_n_h_w_block_cluster_adaptor
)
...
@@ -114,6 +114,7 @@ template <index_t BlockSize,
...
@@ -114,6 +114,7 @@ template <index_t BlockSize,
typename
CGridDesc_K_N_Ho_Wo
,
typename
CGridDesc_K_N_Ho_Wo
,
index_t
E1_
,
index_t
E1_
,
index_t
E2_
,
index_t
E2_
,
index_t
K2_
,
index_t
KPerBlock
,
index_t
KPerBlock
,
index_t
HoPerBlock
,
index_t
HoPerBlock
,
index_t
WoPerBlock
,
index_t
WoPerBlock
,
...
@@ -152,10 +153,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -152,10 +153,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
static
constexpr
auto
I4
=
Number
<
4
>
{};
static
constexpr
auto
I4
=
Number
<
4
>
{};
static
constexpr
auto
I5
=
Number
<
5
>
{};
static
constexpr
auto
I5
=
Number
<
5
>
{};
static
constexpr
auto
E1
=
Number
<
E1_
>
{};
static
constexpr
auto
E1
=
Number
<
E1_
>
{};
static
constexpr
auto
E2
=
Number
<
E2_
>
{};
static
constexpr
auto
E2
=
Number
<
E2_
>
{};
static
constexpr
auto
K2
=
Number
<
K2_
>
{};
static
constexpr
auto
NPerBlock
=
I1
;
static
constexpr
auto
NPerBlock
=
I1
;
static
constexpr
auto
K2
=
2
;
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
{
{
...
@@ -181,12 +183,12 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -181,12 +183,12 @@ struct GridwiseGemmDlops_km_kn_mn_v3
const
auto
Ho
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I2
);
const
auto
Ho
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I2
);
const
auto
Wo
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I3
);
const
auto
Wo
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I3
);
const
auto
K0
=
K
/
KPerBlock
;
const
auto
K0
=
K
/
KPerBlock
;
const
auto
N0
=
N
/
NPerBlock
;
const
auto
N0
=
N
/
NPerBlock
;
const
auto
H
o
0
=
Ho
/
HoPerBlock
;
const
auto
H0
=
Ho
/
HoPerBlock
;
const
auto
W
o
0
=
Wo
/
WoPerBlock
;
const
auto
W0
=
Wo
/
WoPerBlock
;
const
index_t
grid_size
=
K0
*
N0
*
H
o
0
*
W
o
0
;
const
index_t
grid_size
=
K0
*
N0
*
H0
*
W0
;
return
grid_size
;
return
grid_size
;
}
}
...
@@ -314,13 +316,13 @@ struct GridwiseGemmDlops_km_kn_mn_v3
...
@@ -314,13 +316,13 @@ struct GridwiseGemmDlops_km_kn_mn_v3
const
auto
Ho
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I2
);
const
auto
Ho
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I2
);
const
auto
Wo
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I3
);
const
auto
Wo
=
c_k_n_ho_wo_grid_desc
.
GetLength
(
I3
);
const
auto
K0
=
K
/
KPerBlock
;
const
auto
K0
=
K
/
KPerBlock
;
const
auto
N0
=
N
/
NPerBlock
;
const
auto
N0
=
N
/
NPerBlock
;
const
auto
H
o
0
=
Ho
/
HoPerBlock
;
const
auto
H0
=
Ho
/
HoPerBlock
;
const
auto
W
o
0
=
Wo
/
WoPerBlock
;
const
auto
W0
=
Wo
/
WoPerBlock
;
const
auto
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
const
auto
c_blockid_to_k_n_ho_wo_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
N0
,
H
o
0
,
W
o
0
))),
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
N0
,
H0
,
W0
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
>
{}),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
make_tuple
(
Sequence
<
0
>
{}));
...
...
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp
View file @
20e6bc9d
...
@@ -43,7 +43,7 @@ __global__ void
...
@@ -43,7 +43,7 @@ __global__ void
p_shared_block
,
p_shared_block
,
a_k0_m_k1_grid_desc
,
a_k0_m_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
b_k0_n_k1_grid_desc
,
c_m0_
n0_m1_n1_m2_m3_m4_n2
_grid_desc
,
c_m0_
m1_m2_n
_grid_desc
,
c_block_cluster_adaptor
);
c_block_cluster_adaptor
);
}
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
...
...
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
View file @
20e6bc9d
...
@@ -124,6 +124,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
...
@@ -124,6 +124,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
constexpr
index_t
E1
=
2
*
9
;
constexpr
index_t
E1
=
2
*
9
;
constexpr
index_t
E2
=
1
;
constexpr
index_t
E2
=
1
;
constexpr
index_t
K2
=
2
;
constexpr
index_t
E1PerBlock
=
2
;
constexpr
index_t
E1PerBlock
=
2
;
constexpr
index_t
KPerThread
=
8
;
constexpr
index_t
KPerThread
=
8
;
...
@@ -151,6 +152,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
...
@@ -151,6 +152,7 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
TOut
,
TOut
,
E1
,
E1
,
E2
,
E2
,
K2
,
KPerBlock
,
KPerBlock
,
HoPerBlock
,
HoPerBlock
,
WoPerBlock
,
WoPerBlock
,
...
...
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp
View file @
20e6bc9d
This diff is collapsed.
Click to expand it.
host/driver_offline/src/conv_fwd_driver_offline.cpp
View file @
20e6bc9d
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include <initializer_list>
#include <initializer_list>
#include <cstdlib>
#include <cstdlib>
#include <stdlib.h>
#include <stdlib.h>
#include <half.hpp>
//
#include <half.hpp>
#include "config.hpp"
#include "config.hpp"
#include "print.hpp"
#include "print.hpp"
#include "device.hpp"
#include "device.hpp"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment