gaoqiong / composable_kernel

Commit 40694062, authored Sep 30, 2021 by Jing Zhang
Parent: 3e5e4cf7

    add adaptors

Showing 7 changed files with 377 additions and 229 deletions.
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp (+37, -39)
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp (+174, -63)
composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp (+79, -70)
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp (+2, -2)
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp (+73, -43)
host/driver_offline/src/conv_fwd_driver_offline.cpp (+10, -10)
script/run.sh (+2, -2)
composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp

```diff
@@ -13,7 +13,8 @@ template <index_t BlockSize,
           typename ABlockDesc_E1_K_E2,
           typename BBlockDesc_E1_N_Ho_Wo_E2,
           typename CThreadDesc_K_N_Ho_Wo,
-          index_t EPerThreadLoop>
+          index_t EPerThreadLoop,
+          index_t KPerThreadLoop>
 struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
 {
     static constexpr auto I0 = Number<0>{};
@@ -22,27 +23,21 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
     static constexpr auto I3 = Number<3>{};
     static constexpr auto I4 = Number<4>{};
 
-    struct MatrixIndex
-    {
-        index_t k;
-        index_t n;
-        index_t h;
-        index_t w;
-    };
+    using AIndex = MultiIndex<3>;
+    using BIndex = MultiIndex<3>;
+    using CIndex = MultiIndex<4>;
 
-    static constexpr auto E1 = ABlockDesc_E1_K_E2{}.GetLength(I0);
-    static constexpr auto K  = ABlockDesc_E1_K_E2{}.GetLength(I1);
-    static constexpr auto E2 = ABlockDesc_E1_K_E2{}.GetLength(I2);
-
-    static constexpr auto H = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
-    static constexpr auto W = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
+    static constexpr auto E1        = ABlockDesc_E1_K_E2{}.GetLength(I0);
+    static constexpr auto KPerBlock = ABlockDesc_E1_K_E2{}.GetLength(I1);
+    static constexpr auto E2        = ABlockDesc_E1_K_E2{}.GetLength(I2);
+
+    static constexpr auto HPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
+    static constexpr auto WPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
 
     static constexpr auto KPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I0);
     static constexpr auto HPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I2);
     static constexpr auto WPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I3);
 
-    static constexpr index_t KPerThreadLoop = 2;
-
     static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed(
         make_tuple(Number<EPerThreadLoop>{}, Number<KPerThreadLoop>{}, Number<E2>{}));
@@ -57,8 +52,8 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
         Number<KPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
 
     __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3()
-        : c_thread_begin_mtx_idx_{GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id())},
-          a_thread_copy_{make_tuple(0, c_thread_begin_mtx_idx_.k * KPerThread, 0)}
+        : c_thread_origin_data_idx_{GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id())},
+          a_thread_copy_{make_tuple(0, c_thread_origin_data_idx_[I0] * KPerThread, 0)}
     {
         static_assert(ABlockDesc_E1_K_E2::IsKnownAtCompileTime() &&
                           BBlockDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() &&
@@ -73,12 +68,13 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
         static_assert(E1 % EPerThreadLoop == 0, "");
         static_assert(KPerThread % KPerThreadLoop == 0, "");
 
-        static_assert(K % KPerThread == 0 && H % HPerThread == 0 && W % WPerThread == 0,
+        static_assert(KPerBlock % KPerThread == 0 && HPerBlock % HPerThread == 0 &&
+                          WPerBlock % WPerThread == 0,
                       "wrong! Cannot evenly divide work among\n");
 
-        constexpr auto KThreadCluster = K / KPerThread;
-        constexpr auto HThreadCluster = H / HPerThread;
-        constexpr auto WThreadCluster = W / WPerThread;
+        constexpr auto KThreadCluster = KPerBlock / KPerThread;
+        constexpr auto HThreadCluster = HPerBlock / HPerThread;
+        constexpr auto WThreadCluster = WPerBlock / WPerThread;
 
         static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster,
                       "wrong! wrong blocksize\n");
@@ -86,25 +82,27 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
     __device__ static constexpr auto GetCThreadDesc_K_N_Ho_WoLengths()
    {
-        return Sequence<KPerThread, 1, HPerThread, WPerThread>{};
+        return Sequence<KPerThread, I1, HPerThread, WPerThread>{};
     }
 
-    __device__ static MatrixIndex GetBeginOfCThreadDesc_K_N_Ho_Wo(index_t thread_id)
+    __device__ static CIndex GetBeginOfCThreadDesc_K_N_Ho_Wo(index_t thread_id)
     {
-        constexpr index_t HPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2);
-        constexpr index_t WPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3);
-
-        constexpr auto num_w_threads  = WPerBlock / WPerThread;
-        constexpr auto num_h_threads  = HPerBlock / HPerThread;
-        constexpr auto num_hw_threads = num_w_threads * num_h_threads;
-
-        index_t k_thread_id  = thread_id / num_hw_threads;
-        index_t hw_thread_id = thread_id % num_hw_threads;
-
-        index_t h_thread_id = hw_thread_id / num_w_threads;
-        index_t w_thread_id = hw_thread_id % num_w_threads;
-
-        return MatrixIndex{k_thread_id, 1, h_thread_id, w_thread_id};
+        constexpr auto K0 = KPerBlock / KPerThread;
+        constexpr auto N0 = I1;
+        constexpr auto H0 = HPerBlock / HPerThread;
+        constexpr auto W0 = WPerBlock / WPerThread;
+
+        constexpr auto c_threadid_to_k_n_h_w_thread_cluster_adaptor =
+            make_single_stage_tensor_adaptor(
+                make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))),
+                make_tuple(Sequence<0, 1, 2, 3>{}),
+                make_tuple(Sequence<0>{}));
+
+        const auto c_k_n_h_w_thread_cluster_idx =
+            c_threadid_to_k_n_h_w_thread_cluster_adaptor.CalculateBottomIndex(
+                make_multi_index(thread_id));
+
+        return c_k_n_h_w_thread_cluster_idx;
     }
 
     template <typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer>
@@ -157,8 +155,6 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
     }
 
     private:
-    MatrixIndex c_thread_begin_mtx_idx_;
-
     using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<FloatA,
                                                          FloatA,
@@ -170,6 +166,8 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
                                                          E2,
                                                          E2>;
 
+    CIndex c_thread_origin_data_idx_;
+
     AThreadCopy a_thread_copy_;
 };
```
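The new GetBeginOfCThreadDesc_K_N_Ho_Wo builds a single-stage tensor adaptor whose merge transform is inverted by CalculateBottomIndex, replacing the hand-written div/mod chain deleted above. The index math it has to reproduce is shown in this minimal standalone sketch (plain C++ with illustrative names and sizes, not the CK API):

```cpp
// Standalone sketch: turning a flat thread id back into (k, n, h, w) cluster
// coordinates. A merge transform flattens (k, n, h, w) row-major into one
// index; CalculateBottomIndex inverts that flattening with successive div/mod.
#include <array>
#include <cstdio>

std::array<int, 4> decompose_thread_id(int thread_id, int K0, int N0, int H0, int W0)
{
    std::array<int, 4> idx{};
    idx[3] = thread_id % W0; thread_id /= W0; // w
    idx[2] = thread_id % H0; thread_id /= H0; // h
    idx[1] = thread_id % N0; thread_id /= N0; // n (N0 == 1 here)
    idx[0] = thread_id;                       // k
    return idx;
}

int main()
{
    // e.g. K0 = 4, N0 = 1, H0 = 4, W0 = 16 -> a 256-thread block
    const auto idx = decompose_thread_id(137, 4, 1, 4, 16);
    std::printf("k=%d n=%d h=%d w=%d\n", idx[0], idx[1], idx[2], idx[3]);
}
```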
composable_kernel/include/tensor_operation/gridwise_gemm_dlops_v2.hpp

```diff
@@ -19,9 +19,7 @@ template <typename GridwiseGemm,
           typename BGridDesc_E0_E1_N_Ho_Wo_E2,
           typename CGridDesc_K_N_Ho_Wo,
-          bool HasMainE0BlockLoop,
-          bool HasMainE1BlockLoop,
-          bool HasDoubleTailE1BlockLoop>
+          typename CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo,
+          bool HasMainE0BlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
@@ -47,9 +45,8 @@ __global__ void
                       a_e0_e1_k_e2_grid_desc,
                       b_e0_e1_n_ho_wo_e2_grid_desc,
                       c_k_n_ho_wo_grid_desc,
-                      integral_constant<bool, HasMainE0BlockLoop>{},
-                      integral_constant<bool, HasMainE1BlockLoop>{},
-                      integral_constant<bool, HasDoubleTailE1BlockLoop>{});
+                      c_blockid_to_k_n_ho_wo_block_cluster_adaptor,
+                      integral_constant<bool, HasMainE0BlockLoop>{});
 }
 #elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
 // pass tensor descriptor by CONSTANT void pointer
@@ -62,9 +59,7 @@ template <typename GridwiseGemm,
           typename BGridDesc_E0_E1_N_Ho_Wo_E2,
           typename CGridDesc_K_N_Ho_Wo,
-          bool HasMainE0BlockLoop,
-          bool HasMainE1BlockLoop,
-          bool HasDoubleTailE1BlockLoop>
+          typename CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo,
+          bool HasMainE0BlockLoop>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
     __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
@@ -86,6 +81,9 @@ __global__ void
         cast_pointer_to_generic_address_space(p_b_e0_e1_n_ho_wo_e2_grid_desc));
     const auto c_k_n_ho_wo_grid_desc = *reinterpret_cast<const CGridDesc_K_N_Ho_Wo*>(
         cast_pointer_to_generic_address_space(p_c_k_n_ho_wo_grid_desc));
+    const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor =
+        *reinterpret_cast<const CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo*>(
+            cast_pointer_to_generic_address_space(p_c_blockid_to_k_n_ho_wo_block_cluster_adaptor));
 
     constexpr index_t shared_block_size =
         GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
@@ -99,9 +97,8 @@ __global__ void
                       a_e0_e1_k_e2_grid_desc,
                       b_e0_e1_n_ho_wo_e2_grid_desc,
                       c_k_n_ho_wo_grid_desc,
-                      integral_constant<bool, HasMainE0BlockLoop>{},
-                      integral_constant<bool, HasMainE1BlockLoop>{},
-                      integral_constant<bool, HasDoubleTailE1BlockLoop>{});
+                      c_blockid_to_k_n_ho_wo_block_cluster_adaptor,
+                      integral_constant<bool, HasMainE0BlockLoop>{});
 }
 #endif
@@ -110,9 +107,9 @@ template <index_t BlockSize,
           typename FloatAcc,
           typename FloatC,
           InMemoryDataOperationEnum_t CGlobalMemoryDataOperation,
-          typename AGlobalDesc_E0_E1_K_E2,
-          typename BGlobalDesc_E0_E1_N_Ho_Wo_E2,
-          typename CGlobalDesc_K_N_Ho_Wo,
+          typename AGridDesc_E0_E1_K_E2,
+          typename BGridDesc_E0_E1_N_Ho_Wo_E2,
+          typename CGridDesc_K_N_Ho_Wo,
           index_t E1_,
           index_t E2_,
           index_t KPerBlock,
@@ -153,8 +150,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3
     static constexpr auto I4 = Number<4>{};
     static constexpr auto I5 = Number<5>{};
 
     static constexpr auto E1 = Number<E1_>{};
     static constexpr auto E2 = Number<E2_>{};
+    static constexpr auto NPerBlock = I1;
+    static constexpr auto K2 = 2;
 
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
@@ -172,17 +171,143 @@ struct GridwiseGemmDlops_km_kn_mn_v3
         return a_block_space_size * sizeof(FloatAB);
     }
 
-    template <bool HasMainE0BlockLoop, bool HasMainE1BlockLoop, bool HasDoubleTailE1BlockLoop>
-    __device__ static void Run(const FloatAB* __restrict__ p_a_global,
-                               const FloatAB* __restrict__ p_b_global,
-                               FloatC* __restrict__ p_c_global,
-                               FloatAB* __restrict__ p_shared_block,
-                               const AGlobalDesc_E0_E1_K_E2& a_e0_e1_k_e2_global_desc,
-                               const BGlobalDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_global_desc,
-                               const CGlobalDesc_K_N_Ho_Wo& c_k_n_ho_wo_global_desc,
-                               integral_constant<bool, HasMainE0BlockLoop>,
-                               integral_constant<bool, HasMainE1BlockLoop>,
-                               integral_constant<bool, HasDoubleTailE1BlockLoop>)
+    __host__ __device__ static constexpr index_t
+    CalculateGridSize(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
+    {
+        const auto K  = c_k_n_ho_wo_grid_desc.GetLength(I0);
+        const auto N  = c_k_n_ho_wo_grid_desc.GetLength(I1);
+        const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
+        const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
+
+        const auto K0   = K / KPerBlock;
+        const auto N0   = N / NPerBlock;
+        const auto Ho_0 = Ho / HoPerBlock;
+        const auto Wo_0 = Wo / WoPerBlock;
+
+        const index_t grid_size = K0 * N0 * Ho_0 * Wo_0;
+
+        return grid_size;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasMainE0BlockLoop(const index_t E0)
+    {
+        const bool has_main_e0_block_loop = E0 > 1;
+
+        return has_main_e0_block_loop;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasMainE1BlockLoop()
+    {
+        const bool has_main_e1_block_loop = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1;
+
+        return has_main_e1_block_loop;
+    }
+
+    __host__ __device__ static constexpr bool CalculateHasDoubleTailE1BlockLoop()
+    {
+        const bool has_double_tail_e1_block_loop = (E1 / E1PerBlock) % 2 == 0;
+
+        return has_double_tail_e1_block_loop;
+    }
+
+    __host__ __device__ static constexpr auto
+    MakeAE0E1K0K1E2GridDescriptor(const AGridDesc_E0_E1_K_E2& a_e0_e1_k_e2_grid_desc)
+    {
+        const auto E0 = a_e0_e1_k_e2_grid_desc.GetLength(I0);
+        // const auto E1 = a_e0_e1_k_e2_grid_desc.GetLength(I1);
+        const auto K = a_e0_e1_k_e2_grid_desc.GetLength(I2);
+        // const auto E2 = a_e0_e1_k_e2_grid_desc.GetLength(I3);
+
+        const auto K1 = Number<KPerBlock>{};
+        const auto K0 = K / K1;
+
+        const auto a_e0_e1_k0_k1_e2_grid_desc = transform_tensor_descriptor(
+            a_e0_e1_k_e2_grid_desc,
+            make_tuple(make_pass_through_transform(E0),
+                       make_pass_through_transform(E1),
+                       make_unmerge_transform(make_tuple(K0, K1)),
+                       make_pass_through_transform(E2)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}));
+
+        return a_e0_e1_k0_k1_e2_grid_desc;
+    }
+
+    __host__ __device__ static constexpr auto MakeBE0E1NH0H1W0W1E2GridDescriptor(
+        const BGridDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_grid_desc)
+    {
+        const auto E0 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I0);
+        // const auto E1 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I1);
+        const auto N  = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I2);
+        const auto Ho = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I3);
+        const auto Wo = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I4);
+        // const auto E2 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I5);
+
+        const auto H1 = Number<HoPerBlock>{};
+        const auto H0 = Ho / H1;
+
+        const auto W1 = Number<WoPerBlock>{};
+        const auto W0 = Wo / W1;
+
+        const auto b_e0_e1_n_h0_h1_w0_w1_e2_grid_desc = transform_tensor_descriptor(
+            b_e0_e1_n_ho_wo_e2_grid_desc,
+            make_tuple(make_pass_through_transform(E0),
+                       make_pass_through_transform(E1),
+                       make_pass_through_transform(N),
+                       make_unmerge_transform(make_tuple(H0, H1)),
+                       make_unmerge_transform(make_tuple(W0, W1)),
+                       make_pass_through_transform(E2)),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3>{},
+                       Sequence<4>{},
+                       Sequence<5>{}),
+            make_tuple(Sequence<0>{},
+                       Sequence<1>{},
+                       Sequence<2>{},
+                       Sequence<3, 4>{},
+                       Sequence<5, 6>{},
+                       Sequence<7>{}));
+
+        return b_e0_e1_n_h0_h1_w0_w1_e2_grid_desc;
+    }
+
+    __host__ __device__ static constexpr auto
+    MakeCBlockIdToKNHoWoBlockClusterAdaptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc)
+    {
+        const auto K  = c_k_n_ho_wo_grid_desc.GetLength(I0);
+        const auto N  = c_k_n_ho_wo_grid_desc.GetLength(I1);
+        const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2);
+        const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3);
+
+        const auto K0   = K / KPerBlock;
+        const auto N0   = N / NPerBlock;
+        const auto Ho_0 = Ho / HoPerBlock;
+        const auto Wo_0 = Wo / WoPerBlock;
+
+        const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor =
+            make_single_stage_tensor_adaptor(
+                make_tuple(make_merge_transform(make_tuple(K0, N0, Ho_0, Wo_0))),
+                make_tuple(Sequence<0, 1, 2, 3>{}),
+                make_tuple(Sequence<0>{}));
+
+        return c_blockid_to_k_n_ho_wo_block_cluster_adaptor;
+    }
+
+    using CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo =
+        decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{}));
+
+    template <bool HasMainE0BlockLoop>
+    __device__ static void Run(const FloatAB* __restrict__ p_a_global,
+                               const FloatAB* __restrict__ p_b_global,
+                               FloatC* __restrict__ p_c_global,
+                               FloatAB* __restrict__ p_shared_block,
+                               const AGridDesc_E0_E1_K_E2& a_e0_e1_k_e2_global_desc,
+                               const BGridDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_global_desc,
+                               const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_global_desc,
+                               const CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo&
+                                   c_blockid_to_k_n_ho_wo_block_cluster_adaptor,
+                               integral_constant<bool, HasMainE0BlockLoop>)
     {
         const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_a_global, a_e0_e1_k_e2_global_desc.GetElementSpaceSize());
@@ -191,40 +316,24 @@ struct GridwiseGemmDlops_km_kn_mn_v3
         auto c_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
             p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize());
 
         static_assert(E1 % E1PerBlock == 0, "");
 
-        // const auto E = a_e0_e1_k_e2_global_desc.GetLength(I0);
-        // const auto K = a_e0_e1_k_e2_global_desc.GetLength(I1);
-        // const auto N = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I1);
-        const auto Ho = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I3);
-        const auto Wo = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I4);
-
-        // divide block work by [M, N]
-#if 0
-        const auto ho_block_work_num  = Ho / Number<HoPerBlock>{};
-        const auto wo_block_work_num  = Wo / Number<WoPerBlock>{};
-        const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num;
-
-        const index_t k_block_work_id   = get_block_1d_id() / hwo_block_work_num;
-        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
-
-        const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num;
-        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
-#else
-        // Hack: this force result into SGPR
-        const index_t ho_block_work_num  = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock);
-        const index_t wo_block_work_num  = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock);
-        const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num;
-
-        const index_t k_block_work_id =
-            __builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num);
-        const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num;
-
-        const index_t ho_block_work_id =
-            __builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num);
-        const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num;
-#endif
+        constexpr auto HasMainE1BlockLoop       = CalculateHasMainE1BlockLoop();
+        constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop();
+
+        // const auto Ho = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I3);
+        // const auto Wo = b_e0_e1_n_ho_wo_e2_global_desc.GetLength(I4);
+
+        const auto c_k_n_ho_wo_block_cluster_idx =
+            c_blockid_to_k_n_ho_wo_block_cluster_adaptor.CalculateBottomIndex(
+                make_multi_index(get_block_1d_id()));
+
+        const index_t k_block_work_id =
+            __builtin_amdgcn_readfirstlane(c_k_n_ho_wo_block_cluster_idx[I0]);
+        const index_t n_block_work_id =
+            __builtin_amdgcn_readfirstlane(c_k_n_ho_wo_block_cluster_idx[I1]);
+        const index_t ho_block_work_id =
+            __builtin_amdgcn_readfirstlane(c_k_n_ho_wo_block_cluster_idx[I2]);
+        const index_t wo_block_work_id =
+            __builtin_amdgcn_readfirstlane(c_k_n_ho_wo_block_cluster_idx[I3]);
 
         // lds max alignment
         constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
@@ -259,16 +368,18 @@ struct GridwiseGemmDlops_km_kn_mn_v3
                                decltype(a_e1_k_e2_block_desc),
                                decltype(b_e1_n_ho_wo_e2_block_desc),
                                decltype(c_k_n_ho_wo_thread_desc),
-                               EPerThread>{};
+                               EPerThread,
+                               K2>{};
 
         auto c_thread_mtx_index =
             blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id());
 
-        const auto k_thread_id  = c_thread_mtx_index.k;
-        const auto ho_thread_id = c_thread_mtx_index.h;
-        const auto wo_thread_id = c_thread_mtx_index.w;
+        const auto k_thread_id  = c_thread_mtx_index[I0];
+        const auto ho_thread_id = c_thread_mtx_index[I2];
+        const auto wo_thread_id = c_thread_mtx_index[I3];
 
         const index_t k_block_data_on_global  = k_block_work_id * KPerBlock;
+        const index_t n_block_data_on_global  = n_block_work_id * HoPerBlock;
         const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock;
         const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock;
@@ -320,7 +431,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
             FloatAB,
             decltype(b_e0_e1_n_ho_wo_e2_global_desc),
             decltype(b_e0_e1_n_ho_wo_e2_thread_desc),
-            Sequence<I1, E1PerBlock, 1, HoPerThread, WoPerThread, E2>,
+            Sequence<I1, E1PerBlock, NPerBlock, HoPerThread, WoPerThread, E2>,
             BBlockTransferSrcAccessOrder,
             BBlockTransferSrcVectorDim,
             BBlockTransferSrcScalarPerVector,
@@ -346,7 +457,7 @@ struct GridwiseGemmDlops_km_kn_mn_v3
         // initialize output thread tensor
         ThreadwiseTensorSliceSet_v1<FloatAcc,
                                     decltype(c_k_n_ho_wo_thread_desc),
-                                    Sequence<KPerThread, 1, HoPerThread, WoPerThread>>{}
+                                    Sequence<KPerThread, NPerBlock, HoPerThread, WoPerThread>>{}
             .Run(c_k_n_ho_wo_thread_desc,
                  make_tuple(I0, I0, I0, I0),
                  c_thread_buf,
                  FloatAcc{0});
 
         constexpr auto b_thread_slice_copy_step = make_multi_index(0, E1PerBlock, 0, 0, 0, 0);
```
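The E1 loop predicates that used to be computed in the driver are now derived inside GridwiseGemm from the same formulas. A quick compile-time check of those formulas, using illustrative values E1 = 18 (i.e. 2 * 9, as set in the v5r1 device wrapper below) and an assumed E1PerBlock = 2 (the pairing of these two numbers is an assumption for the example, not taken from a verified config):

```cpp
// Standalone sketch of the loop predicates, evaluated at compile time.
#include <cstdio>

constexpr bool has_main_e1_block_loop(int E1, int E1PerBlock)
{
    return (E1 + E1PerBlock) / (2 * E1PerBlock) > 1;
}

constexpr bool has_double_tail_e1_block_loop(int E1, int E1PerBlock)
{
    return (E1 / E1PerBlock) % 2 == 0;
}

static_assert(has_main_e1_block_loop(18, 2), "(18 + 2) / (2 * 2) = 5 > 1");
static_assert(!has_double_tail_e1_block_loop(18, 2), "18 / 2 = 9 is odd");

int main() { std::printf("E1 loop predicates checked at compile time\n"); }
```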
composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp

```diff
@@ -71,84 +71,93 @@ struct ThreadwiseGemmDlops_km_kn_mn_v3
         constexpr auto b_origin_idx = to_multi_index(BOriginIdx{});
         constexpr auto c_origin_idx = to_multi_index(COriginIdx{});
 
-#if 1
-        constexpr index_t SubHW = 2;
-
-        static_for<0, K, 1>{}([&](auto k) {
-            static_for<0, Ho, SubHW>{}([&](auto h) {
-                static_for<0, Wo, SubHW>{}([&](auto w) {
-                    static_for<0, E1, 1>{}([&](auto e1) {
-                        static_for<0, E2, 1>{}([&](auto e2) {
-                            constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
-                                a_origin_idx + make_tuple(e1, k, e2));
-
-                            constexpr index_t b0_offset =
-                                BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
-                                    b_origin_idx + make_tuple(e1, 0, h, w, e2));
-                            constexpr index_t b1_offset =
-                                BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
-                                    b_origin_idx + make_tuple(e1, 0, h, w + 1, e2));
-                            constexpr index_t b2_offset =
-                                BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
-                                    b_origin_idx + make_tuple(e1, 0, h + 1, w, e2));
-                            constexpr index_t b3_offset =
-                                BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
-                                    b_origin_idx + make_tuple(e1, 0, h + 1, w + 1, e2));
-
-                            constexpr index_t c0_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h, w));
-                            constexpr index_t c1_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h, w + 1));
-                            constexpr index_t c2_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h + 1, w));
-                            constexpr index_t c3_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h + 1, w + 1));
-
-                            amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
-                                                           b_buf[Number<b0_offset>{}],
-                                                           b_buf[Number<b1_offset>{}],
-                                                           b_buf[Number<b2_offset>{}],
-                                                           b_buf[Number<b3_offset>{}],
-                                                           c_buf(Number<c0_offset>{}),
-                                                           c_buf(Number<c1_offset>{}),
-                                                           c_buf(Number<c2_offset>{}),
-                                                           c_buf(Number<c3_offset>{}));
-                        });
-                    });
-                });
-            });
-        });
-#else
-        static_for<0, K, 1>{}([&](auto k) {
-            static_for<0, Ho, 1>{}([&](auto h) {
-                static_for<0, Wo, 1>{}([&](auto w) {
-                    static_for<0, E1, 1>{}([&](auto e1) {
-                        static_for<0, E2, 1>{}([&](auto e2) {
-                            constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset(
-                                a_origin_idx + make_tuple(e1, k, e2));
-                            constexpr index_t b_offset =
-                                BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
-                                    b_origin_idx + make_tuple(e1, 0, h, w, e2));
-                            constexpr index_t c_offset = CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
-                                c_origin_idx + make_tuple(k, 0, h, w));
-
-                            inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
-                                                                  b_buf[Number<b_offset>{}],
-                                                                  c_buf(Number<c_offset>{}));
-                        });
-                    });
-                });
-            });
-        });
-#endif
+        if constexpr((Ho % 2 == 0) && (Wo % 2 == 0))
+        {
+            constexpr auto SubHW = 2;
+
+            static_for<0, K, 1>{}([&](auto k) {
+                static_for<0, Ho, SubHW>{}([&](auto h) {
+                    static_for<0, Wo, SubHW>{}([&](auto w) {
+                        static_for<0, E1, 1>{}([&](auto e1) {
+                            static_for<0, E2, 1>{}([&](auto e2) {
+                                constexpr index_t a_offset =
+                                    AThreadDesc_E1_K_E2{}.CalculateOffset(
+                                        a_origin_idx + make_tuple(e1, k, e2));
+
+                                constexpr index_t b0_offset =
+                                    BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
+                                        b_origin_idx + make_tuple(e1, 0, h, w, e2));
+                                constexpr index_t b1_offset =
+                                    BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
+                                        b_origin_idx + make_tuple(e1, 0, h, w + 1, e2));
+                                constexpr index_t b2_offset =
+                                    BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
+                                        b_origin_idx + make_tuple(e1, 0, h + 1, w, e2));
+                                constexpr index_t b3_offset =
+                                    BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
+                                        b_origin_idx + make_tuple(e1, 0, h + 1, w + 1, e2));
+
+                                constexpr index_t c0_offset =
+                                    CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
+                                        c_origin_idx + make_tuple(k, 0, h, w));
+                                constexpr index_t c1_offset =
+                                    CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
+                                        c_origin_idx + make_tuple(k, 0, h, w + 1));
+                                constexpr index_t c2_offset =
+                                    CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
+                                        c_origin_idx + make_tuple(k, 0, h + 1, w));
+                                constexpr index_t c3_offset =
+                                    CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
+                                        c_origin_idx + make_tuple(k, 0, h + 1, w + 1));
+
+                                amd_assembly_outer_product_1x4(a_buf[Number<a_offset>{}],
+                                                               b_buf[Number<b0_offset>{}],
+                                                               b_buf[Number<b1_offset>{}],
+                                                               b_buf[Number<b2_offset>{}],
+                                                               b_buf[Number<b3_offset>{}],
+                                                               c_buf(Number<c0_offset>{}),
+                                                               c_buf(Number<c1_offset>{}),
+                                                               c_buf(Number<c2_offset>{}),
+                                                               c_buf(Number<c3_offset>{}));
+                            });
+                        });
+                    });
+                });
+            });
+        }
+        else
+        {
+            static_for<0, K, 1>{}([&](auto k) {
+                static_for<0, Ho, 1>{}([&](auto h) {
+                    static_for<0, Wo, 1>{}([&](auto w) {
+                        static_for<0, E1, 1>{}([&](auto e1) {
+                            static_for<0, E2, 1>{}([&](auto e2) {
+                                constexpr index_t a_offset =
+                                    AThreadDesc_E1_K_E2{}.CalculateOffset(
+                                        a_origin_idx + make_tuple(e1, k, e2));
+                                constexpr index_t b_offset =
+                                    BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset(
+                                        b_origin_idx + make_tuple(e1, 0, h, w, e2));
+                                constexpr index_t c_offset =
+                                    CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(
+                                        c_origin_idx + make_tuple(k, 0, h, w));
+
+                                inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
+                                                                      b_buf[Number<b_offset>{}],
+                                                                      c_buf(Number<c_offset>{}));
+                            });
+                        });
+                    });
+                });
+            });
+        }
     }
 };
```
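The fast path keeps the 2x2 spatial register tile and the amd_assembly_outer_product_1x4 call, but only when Ho and Wo are both even; otherwise it falls back to plain inner_product. As a reading aid, here is a reference sketch of what the 1x4 outer product is assumed to compute (the real routine is inline GCN assembly; this is not its implementation):

```cpp
// Assumed reference semantics: one A element accumulated against the four
// neighbouring B elements of a 2x2 (h, w) tile, which is why the fast path
// requires Ho % 2 == 0 and Wo % 2 == 0.
#include <cstdio>

template <typename FloatA, typename FloatB, typename FloatC>
void outer_product_1x4_ref(FloatA a,
                           FloatB b0, FloatB b1, FloatB b2, FloatB b3,
                           FloatC& c0, FloatC& c1, FloatC& c2, FloatC& c3)
{
    c0 += a * b0; // (h,     w)
    c1 += a * b1; // (h,     w + 1)
    c2 += a * b2; // (h + 1, w)
    c3 += a * b3; // (h + 1, w + 1)
}

int main()
{
    float c0 = 0, c1 = 0, c2 = 0, c3 = 0;
    outer_product_1x4_ref(2.0f, 1.0f, 2.0f, 3.0f, 4.0f, c0, c1, c2, c3);
    std::printf("%.1f %.1f %.1f %.1f\n", c0, c1, c2, c3); // 2.0 4.0 6.0 8.0
}
```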
host/driver_offline/include/device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp

```diff
@@ -122,9 +122,9 @@ void device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw(
     constexpr index_t HoPerBlock = 8;
     constexpr index_t WoPerBlock = 32;
 
-    constexpr index_t E1 = C0 * 9;
+    constexpr index_t E1 = 2 * 9;
     constexpr index_t E2 = 1;
-    constexpr index_t EPerBlock = C0;
+    constexpr index_t EPerBlock = 2;
 
     constexpr index_t KPerThread  = 16;
     constexpr index_t HoPerThread = 2;
```
host/driver_offline/include/driver_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp

```diff
@@ -77,8 +77,11 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
     const auto ConvDilationH = conv_dilations[I0];
     const auto ConvDilationW = conv_dilations[I1];
 
-    const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
-    const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
+    // const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
+    // const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
+
+    const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
+    const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
 
     const auto OutRightPadH = Hop - Ho;
     const auto OutRightPadW = Wop - Wo;
@@ -226,9 +229,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
                    Sequence<0, 0, 0, 0, 0>{},
                    Sequence<0, 0, 0, 0, 0>{}));
 
-    static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
-    static_assert(b_e0_e1_n_ho_wo_e2_grid_desc.IsKnownAtCompileTime(), "");
-    static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");
+    // static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
+    // static_assert(b_e0_e1_n_ho_wo_e2_grid_desc.IsKnownAtCompileTime(), "");
+    // static_assert(c_k_n_hop_wop_grid_desc.IsKnownAtCompileTime(), "");
 
     // GEMM
     using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v3<
@@ -279,19 +282,12 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
     const auto grid_size =
         (K / KPerBlock) * (Hop / HoPerBlock) * (Wop / WoPerBlock) * N;
 
-    const bool has_main_e0_block_loop        = E0 > 1;
-    const bool has_main_e1_block_loop        = (E1 + E1PerBlock) / (2 * E1PerBlock) > 1;
-    const bool has_double_tail_e1_block_loop = (E1 / E1PerBlock) % 2 == 0;
+    const bool has_main_e0_block_loop = E0 > 1;
 
-    std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop
-              << "has_main_e1_block_loop = " << has_main_e1_block_loop
-              << " has_double_tail_e1_block_loop = " << has_double_tail_e1_block_loop << std::endl;
+    std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
 
-    const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor = make_single_stage_tensor_adaptor(
-        make_tuple(make_merge_transform(make_tuple(I0, I0))),
-        make_tuple(Sequence<0, 1>{}),
-        make_tuple(Sequence<0>{}));
+    const auto c_blockid_to_k_n_ho_wo_block_cluster_adaptor =
+        GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
 
     using CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo =
         decltype(c_blockid_to_k_n_ho_wo_block_cluster_adaptor);
@@ -337,35 +333,69 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nchw_kcyx_nkhw_outp
     c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.ToDevice(
         &c_blockid_to_k_n_ho_wo_block_cluster_adaptor);
 
-    const auto kernel =
-        kernel_gemm_dlops_v2<GridwiseGemm,
-                             FloatAB,
-                             FloatC,
-                             remove_reference_t<AGridDesc_E0_E1_K_E2>,
-                             remove_reference_t<BGridDesc_E0_E1_N_Ho_Wo_E2>,
-                             remove_reference_t<CGridDesc_K_N_Ho_Wo>,
-                             remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo>,
-                             has_main_e0_block_loop,
-                             has_main_e1_block_loop,
-                             has_double_tail_e1_block_loop>;
-
-    ave_time = launch_and_time_kernel(
-        kernel,
-        nrepeat,
-        dim3(grid_size),
-        dim3(BlockSize),
-        0,
-        p_a_grid,
-        p_b_grid,
-        p_c_grid,
-        cast_pointer_to_constant_address_space(a_e0_e1_k_e2_grid_desc_dev_buf.GetDeviceBuffer()),
-        cast_pointer_to_constant_address_space(
-            b_e0_e1_n_ho_wo_e2_grid_desc_dev_buf.GetDeviceBuffer()),
-        cast_pointer_to_constant_address_space(c_k_n_hop_wop_grid_desc_dev_buf.GetDeviceBuffer()),
-        cast_pointer_to_constant_address_space(
-            c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
+    if(has_main_e0_block_loop)
+    {
+        const auto kernel =
+            kernel_gemm_dlops_v2<GridwiseGemm,
+                                 FloatAB,
+                                 FloatC,
+                                 remove_reference_t<AGridDesc_E0_E1_K_E2>,
+                                 remove_reference_t<BGridDesc_E0_E1_N_Ho_Wo_E2>,
+                                 remove_reference_t<CGridDesc_K_N_Ho_Wo>,
+                                 remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo>,
+                                 true>;
+
+        ave_time = launch_and_time_kernel(
+            kernel,
+            nrepeat,
+            dim3(grid_size),
+            dim3(BlockSize),
+            0,
+            p_a_grid,
+            p_b_grid,
+            p_c_grid,
+            cast_pointer_to_constant_address_space(
+                a_e0_e1_k_e2_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_e0_e1_n_ho_wo_e2_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_k_n_hop_wop_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
+    }
+    else
+    {
+        const auto kernel =
+            kernel_gemm_dlops_v2<GridwiseGemm,
+                                 FloatAB,
+                                 FloatC,
+                                 remove_reference_t<AGridDesc_E0_E1_K_E2>,
+                                 remove_reference_t<BGridDesc_E0_E1_N_Ho_Wo_E2>,
+                                 remove_reference_t<CGridDesc_K_N_Ho_Wo>,
+                                 remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_Ho_Wo>,
+                                 false>;
+
+        ave_time = launch_and_time_kernel(
+            kernel,
+            nrepeat,
+            dim3(grid_size),
+            dim3(BlockSize),
+            0,
+            p_a_grid,
+            p_b_grid,
+            p_c_grid,
+            cast_pointer_to_constant_address_space(
+                a_e0_e1_k_e2_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_e0_e1_n_ho_wo_e2_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_k_n_hop_wop_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_k_n_ho_wo_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
+    }
 #endif
 
     return ave_time;
 }
```
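The driver now branches on the runtime value has_main_e0_block_loop and instantiates the kernel with a literal true or false, since a runtime bool cannot be used as a template argument. A minimal sketch of that dispatch pattern (hypothetical names, not the CK kernel or launcher):

```cpp
// Runtime value -> compile-time template argument via one if/else at launch.
#include <cstdio>

template <bool HasMainE0BlockLoop>
void run_gemm()
{
    // In the real code this would be a fully specialized kernel instantiation.
    std::printf("HasMainE0BlockLoop = %d\n", HasMainE0BlockLoop);
}

void launch(int E0)
{
    const bool has_main_e0_block_loop = E0 > 1;

    if(has_main_e0_block_loop)
        run_gemm<true>();  // one instantiation
    else
        run_gemm<false>(); // the other
}

int main() { launch(1); launch(4); }
```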
host/driver_offline/src/conv_fwd_driver_offline.cpp

```diff
@@ -20,7 +20,7 @@
 #include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
 
-#define USE_DYNAMIC_MODE 0
+#define USE_DYNAMIC_MODE 1
 #define USE_CONV_FWD_V4R4_NCHW 0
 #define USE_CONV_FWD_V4R4R2_NHWC 0
 #define USE_CONV_FWD_V6R1_NCHW 0
@@ -52,6 +52,8 @@ int main(int argc, char* argv[])
     constexpr auto I5 = Number<5>{};
     constexpr auto I6 = Number<6>{};
 
+    constexpr index_t activ_type = 0;
+
 #if USE_DYNAMIC_MODE
     // dynamic mode
     if(argc != 22)
@@ -105,16 +107,14 @@ int main(int argc, char* argv[])
     const bool do_log = std::stoi(argv[5]);
     const int nrepeat = std::stoi(argv[6]);
 
-    constexpr index_t activ_type = 0;
-
 #if 1
     constexpr auto N  = Number<1>{};
     constexpr auto C  = Number<16>{};
     constexpr auto Hi = Number<1080>{};
     constexpr auto Wi = Number<1920>{};
     constexpr auto K  = Number<64>{};
     constexpr auto Y  = Number<3>{};
     constexpr auto X  = Number<3>{};
 #elif 0
     constexpr auto N = Number<1>{};
     constexpr auto C = Number<16>{};
```
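Note: with USE_DYNAMIC_MODE set to 1 the driver takes the problem size from the command line (it checks argc != 22), i.e. the six script parameters (layout, algo, verify, init, log, repeat) plus the fifteen convolution numbers (N K C Y X Hi Wi, strides, dilations, left pads, right pads), which matches the invocation uncommented in script/run.sh below.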
script/run.sh

```diff
@@ -29,7 +29,7 @@ INIT=$4
 LOG=$5
 REPEAT=$6
 
-./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT
+#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT
 
 ################################################ layout algo verify init log repeat N__ K___ C___ Y X Hi_ Wi__ Strides Dilations LeftPads RightPads
 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
@@ -53,7 +53,7 @@ REPEAT=$6
 #./host/driver_online/conv_fwd_driver_online $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 128 128 192 3 3 71 71 2 2 1 1 1 1 1 1
 
-#./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1 16 16 3 3 1080 1920 1 1 1 1 1 1 1 1
+./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1 16 16 3 3 1080 1920 1 1 1 1 1 1 1 1
 
 #./host/driver_offline/conv_fwd_driver_offline $LAYOUT $ALGO $VERIFY $INIT $LOG $REPEAT 1 16 16 1 1 16 16 1 1 1 1 0 0 0 0
 ################################################ layout algo verify init log repeat M___ N___ K___
```