Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
88d51698
"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "6356cbdc4f652aa6004b08b0fc05f6a4ec9d0b97"
Commit
88d51698
authored
Mar 14, 2021
by
root
Browse files
debug
parent
758f576a
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
51 additions
and
39 deletions
+51
-39
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+1
-1
composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
...ble_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
+9
-2
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
...nel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+38
-33
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+3
-3
No files found.
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
View file @
88d51698
...
@@ -209,7 +209,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
...
@@ -209,7 +209,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad
decltype
(
a_k_m_global_move_slice_window_iterator_hack
),
decltype
(
a_k_m_global_move_slice_window_iterator_hack
),
decltype
(
b_k_n_global_move_slice_window_iterator_hack
)
>
;
decltype
(
b_k_n_global_move_slice_window_iterator_hack
)
>
;
const
auto
GridSize
=
(
K
/
KPerBlock
)
*
(
Ho
/
HPerBlock
)
*
(
Wo
/
WPerBlock
);
const
auto
GridSize
=
(
K
/
KPerBlock
)
*
(
Ho
/
HPerBlock
)
*
(
Wo
/
WPerBlock
)
*
N
;
const
bool
has_main_k_block_loop
=
(
CYX
+
CYXPerBlock
)
/
(
2
*
CYXPerBlock
)
>
1
;
const
bool
has_main_k_block_loop
=
(
CYX
+
CYXPerBlock
)
/
(
2
*
CYXPerBlock
)
>
1
;
...
...
composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
View file @
88d51698
...
@@ -110,16 +110,18 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
...
@@ -110,16 +110,18 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
__device__
void
__device__
void
Run_naive
(
const
FloatA
*
p_a_block
,
const
FloatB
*
p_b_thread
,
FloatC
*
p_c_thread
)
const
Run_naive
(
const
FloatA
*
p_a_block
,
const
FloatB
*
p_b_thread
,
FloatC
*
p_c_thread
)
const
{
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
a_block_mtx
=
BlockMatrixA
{};
constexpr
auto
a_block_mtx
=
BlockMatrixA
{};
constexpr
auto
b_block_mtx
=
BlockMatrixB
{};
constexpr
auto
CYXPerBlock
=
a_block_mtx
.
GetLength
(
I0
);
constexpr
auto
CYXPerBlock
=
a_block_mtx
.
GetLength
(
I0
);
static_assert
(
CYXPerBlock
==
CYXPerThreadLoop
,
""
);
// thread A, B for GEMM
// thread A, B for GEMM
constexpr
auto
a_thread_mtx
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
constexpr
auto
a_thread_mtx
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
CYXPerThreadLoop
>
{},
Number
<
KPerThread
>
{}));
make_tuple
(
Number
<
CYXPerThreadLoop
>
{},
Number
<
KPerThread
>
{}));
...
@@ -145,11 +147,16 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
...
@@ -145,11 +147,16 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
// loop over k
// loop over k
for
(
index_t
cyx_begin
=
0
;
cyx_begin
<
CYXPerBlock
;
cyx_begin
+=
CYXPerThreadLoop
)
for
(
index_t
cyx_begin
=
0
;
cyx_begin
<
CYXPerBlock
;
cyx_begin
+=
CYXPerThreadLoop
)
{
{
#if 1
a_thread_copy
.
Run
(
p_a_block
+
a_block_mtx
.
CalculateOffset
(
make_tuple
(
cyx_begin
,
0
))
+
a_thread_copy
.
Run
(
p_a_block
+
a_block_mtx
.
CalculateOffset
(
make_tuple
(
cyx_begin
,
0
))
+
mMyThreadOffsetA
,
mMyThreadOffsetA
,
p_a_thread
+
a_thread_mtx
.
CalculateOffset
(
make_tuple
(
0
,
0
)));
p_a_thread
+
a_thread_mtx
.
CalculateOffset
(
make_tuple
(
0
,
0
)));
#else
for
(
index_t
i
=
0
;
i
<
a_thread_mtx
.
GetElementSpaceSize
();
i
++
)
p_a_thread
[
i
]
=
1
;
#endif
threadwise_gemm
.
Run
(
p_a_thread
,
p_b_thread
+
CYXPerThreadLoop
*
cyx_begin
,
p_c_thread
);
threadwise_gemm
.
Run
(
p_a_thread
,
p_b_thread
+
cyx_begin
,
p_c_thread
);
}
}
}
}
...
...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
View file @
88d51698
...
@@ -88,8 +88,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -88,8 +88,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
const
auto
CYX
=
a_cyx_k_global_desc
.
GetLength
(
I0
);
const
auto
CYX
=
a_cyx_k_global_desc
.
GetLength
(
I0
);
const
auto
K
=
a_cyx_k_global_desc
.
GetLength
(
I1
);
const
auto
K
=
a_cyx_k_global_desc
.
GetLength
(
I1
);
static_assert
(
CYX
==
4
*
3
*
3
&&
K
==
16
,
""
);
const
auto
N
=
b_cyx_n_h_w_global_desc
.
GetLength
(
I1
);
const
auto
N
=
b_cyx_n_h_w_global_desc
.
GetLength
(
I1
);
const
auto
H
=
b_cyx_n_h_w_global_desc
.
GetLength
(
I2
);
const
auto
H
=
b_cyx_n_h_w_global_desc
.
GetLength
(
I2
);
const
auto
W
=
b_cyx_n_h_w_global_desc
.
GetLength
(
I3
);
const
auto
W
=
b_cyx_n_h_w_global_desc
.
GetLength
(
I3
);
...
@@ -102,6 +100,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -102,6 +100,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
const
index_t
k_block_work_id
=
get_block_1d_id
()
/
nhw_block_work_num
;
const
index_t
k_block_work_id
=
get_block_1d_id
()
/
nhw_block_work_num
;
const
index_t
nhw_block_work_id
=
get_block_1d_id
()
-
k_block_work_id
*
nhw_block_work_num
;
const
index_t
nhw_block_work_id
=
get_block_1d_id
()
-
k_block_work_id
*
nhw_block_work_num
;
constexpr
auto
h_num_threads
=
HPerBlock
/
HPerThread
;
constexpr
auto
w_num_threads
=
WPerBlock
/
WPerThread
;
static_assert
(
KPerBlock
==
KPerThread
,
""
);
const
auto
h_thread_id
=
get_thread_local_1d_id
()
/
h_num_threads
;
const
auto
w_thread_id
=
get_thread_local_1d_id
()
%
w_num_threads
;
#else
#else
// Hack: this force result into SGPR
// Hack: this force result into SGPR
const
index_t
m_block_work_num
=
__builtin_amdgcn_readfirstlane
(
K
/
KPerBlock
);
const
index_t
m_block_work_num
=
__builtin_amdgcn_readfirstlane
(
K
/
KPerBlock
);
...
@@ -114,8 +119,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -114,8 +119,8 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
const
index_t
m_block_data_on_global
=
k_block_work_id
*
KPerBlock
;
const
index_t
m_block_data_on_global
=
k_block_work_id
*
KPerBlock
;
const
index_t
h_block_data_on_global
=
nhw_block_work_id
*
8
;
const
index_t
h_block_data_on_global
=
nhw_block_work_id
*
HPerBlock
;
const
index_t
w_block_data_on_global
=
nhw_block_work_id
*
8
;
const
index_t
w_block_data_on_global
=
nhw_block_work_id
*
WPerBlock
;
// lds max alignment
// lds max alignment
constexpr
auto
max_lds_align
=
constexpr
auto
max_lds_align
=
...
@@ -128,9 +133,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -128,9 +133,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
// B matrix in LDS memory, dst of blockwise copy
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
// be careful of LDS alignment
constexpr
auto
b_cyx_n_h_w_block_desc
=
make_dynamic_naive_tensor_descriptor_aligned_v2
(
constexpr
auto
b_cyx_n_h_w_block_desc
=
make_
tuple
(
Number
<
CYXPerBlock
>
{},
Number
<
1
>
{},
Number
<
8
>
{},
Number
<
8
>
{}),
make_
dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
max_lds_align
);
Number
<
CYXPerBlock
>
{},
Number
<
1
>
{},
Number
<
HPerBlock
>
{},
Number
<
WPerBlock
>
{})
);
// A matrix blockwise copy
// A matrix blockwise copy
auto
a_blockwise_copy
=
auto
a_blockwise_copy
=
...
@@ -162,18 +167,16 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -162,18 +167,16 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
make_multi_index
(
0
,
0
));
make_multi_index
(
0
,
0
));
#if 1
#if 1
constexpr
auto
b_cyx_n_h_w_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
constexpr
auto
b_cyx_n_h_w_thread_desc
=
make_tuple
(
Number
<
CYXPerBlock
>
{},
Number
<
1
>
{},
Number
<
1
>
{},
Number
<
1
>
{}));
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
CYXPerThread
>
{},
Number
<
1
>
{},
Number
<
HPerThread
>
{},
Number
<
WPerThread
>
{}));
const
index_t
h_thread_id
=
get_thread_local_1d_id
()
/
8
;
const
index_t
w_thread_id
=
get_thread_local_1d_id
()
%
8
;
using
ThreadwiseTensorSliceTransferB
=
ThreadwiseDynamicTensorSliceTransfer_v2
<
using
ThreadwiseTensorSliceTransferB
=
ThreadwiseDynamicTensorSliceTransfer_v2
<
Float
,
Float
,
Float
,
Float
,
decltype
(
b_cyx_n_h_w_global_desc
),
decltype
(
b_cyx_n_h_w_global_desc
),
decltype
(
b_cyx_n_h_w_thread_desc
),
decltype
(
b_cyx_n_h_w_thread_desc
),
Sequence
<
CYXPer
Block
,
1
,
1
,
1
>
,
Sequence
<
CYXPer
Thread
,
1
,
HPerThread
,
WPerThread
>
,
Sequence
<
3
,
2
,
0
,
1
>
,
// BBlockTransferSrcAccessOrder,
Sequence
<
3
,
2
,
0
,
1
>
,
// BBlockTransferSrcAccessOrder,
3
,
// BBlockTransferSrcVectorDim,
3
,
// BBlockTransferSrcVectorDim,
1
,
// BBlockTransferSrcScalarPerVector,
1
,
// BBlockTransferSrcScalarPerVector,
...
@@ -191,8 +194,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -191,8 +194,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
#endif
#endif
// c_thread_mtx definition: this is a mess
// c_thread_mtx definition: this is a mess
// TODO:: more elegent way of defining c_thread_mtx
// TODO:: more elegent way of defining c_thread_mtx
constexpr
auto
c_k_n_h_w_thread_desc
=
make_dynamic_naive_tensor_descriptor_packed_v2
(
constexpr
auto
c_k_n_h_w_thread_desc
=
make_tuple
(
Number
<
KPerThread
>
{},
Number
<
1
>
{},
Number
<
1
>
{},
Number
<
1
>
{}));
make_dynamic_naive_tensor_descriptor_packed_v2
(
make_tuple
(
Number
<
KPerThread
>
{},
Number
<
1
>
{},
Number
<
HPerThread
>
{},
Number
<
WPerThread
>
{}));
#if 1
#if 1
const
auto
blockwise_gemm
=
const
auto
blockwise_gemm
=
...
@@ -200,14 +204,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -200,14 +204,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
decltype
(
a_cyx_k_block_desc
),
decltype
(
a_cyx_k_block_desc
),
decltype
(
b_cyx_n_h_w_block_desc
),
decltype
(
b_cyx_n_h_w_block_desc
),
decltype
(
c_k_n_h_w_thread_desc
),
decltype
(
c_k_n_h_w_thread_desc
),
16
,
// KPerThreadSubC
KPerThread
,
// KPerThreadSubC
1
,
// HPerThreadSubC
HPerThread
,
// HPerThreadSubC
1
,
// WPerThreadSubC
WPerThread
,
// WPerThreadSubC
1
,
// CYXPerThreadLoop
CYXPerThread
,
// CYXPerThreadLoop
8
,
// HThreadCluster
h_num_threads
,
// HThreadCluster
8
,
// WThreadCluster
w_num_threads
,
// WThreadCluster
1
,
// ThreadGemmADataPerRead_K
1
,
// ThreadGemmADataPerRead_K
1
// ThreadGemmBDataPerRead_W
1
// ThreadGemmBDataPerRead_W
>
{};
>
{};
#endif
#endif
...
@@ -232,7 +236,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -232,7 +236,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
// threadwise_matrix_set_zero_v2(c_k_n_h_w_thread_desc, p_c_thread);
// threadwise_matrix_set_zero_v2(c_k_n_h_w_thread_desc, p_c_thread);
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
CYXPerBlock
,
0
);
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
CYXPerBlock
,
0
);
constexpr
auto
b_block_slice_copy_step
=
make_multi_index
(
CYXPerBlock
,
0
,
0
,
0
);
//
constexpr auto b_block_slice_copy_step = make_multi_index(CYXPerBlock, 0, 0, 0);
// hack to control index calculation when iterating over A and B matrix for threadwise copy
// hack to control index calculation when iterating over A and B matrix for threadwise copy
constexpr
auto
a_k_m_global_iterator_hacks
=
AGlobalIteratorHacks
{};
constexpr
auto
a_k_m_global_iterator_hacks
=
AGlobalIteratorHacks
{};
...
@@ -245,14 +249,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -245,14 +249,13 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
constexpr
auto
b_cyx_n_h_w_global_move_slice_window_iterator_hack
=
constexpr
auto
b_cyx_n_h_w_global_move_slice_window_iterator_hack
=
BGlobalMoveSliceWindowIteratorHacks
{};
BGlobalMoveSliceWindowIteratorHacks
{};
Float
p_b_thread
[
b_cyx_n_h_w_thread_desc
.
GetElementSpaceSize
()];
// LDS double buffer: preload data into LDS
// LDS double buffer: preload data into LDS
{
{
a_blockwise_copy
.
RunRead
(
a_cyx_k_global_desc
,
p_a_global
,
a_k_m_global_iterator_hacks
);
a_blockwise_copy
.
RunRead
(
a_cyx_k_global_desc
,
p_a_global
,
a_k_m_global_iterator_hacks
);
constexpr
auto
b_thread_mtx
=
b_cyx_n_h_w_thread_desc
;
Float
p_b_thread
[
b_thread_mtx
.
GetElementSpaceSize
()];
b_threadwise_transfer
.
Run
(
b_cyx_n_h_w_global_desc
,
b_threadwise_transfer
.
Run
(
b_cyx_n_h_w_global_desc
,
p_b_global
,
p_b_global
,
b_cyx_n_h_w_thread_desc
,
b_cyx_n_h_w_thread_desc
,
...
@@ -264,14 +267,19 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -264,14 +267,19 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
__syncthreads
();
__syncthreads
();
blockwise_gemm
.
Run
(
p_a_block_double
,
p_b_thread
,
p_c_thread
);
//blockwise_gemm.Run(p_a_block_double, p_b_thread, p_c_thread);
index_t
sum
=
0
;
for
(
index_t
i
=
0
;
i
<
b_cyx_n_h_w_thread_desc
.
GetElementSpaceSize
();
i
++
)
sum
+=
p_b_thread
[
i
];
p_c_thread
[
0
]
=
get_thread_local_1d_id
()
*
10000
+
sum
;
}
}
#if 0
#if 0
if constexpr(HasMainKBlockLoop)
if constexpr(HasMainKBlockLoop)
{
{
Float* p_a_block_even = p_a_block_double;
Float* p_a_block_even = p_a_block_double;
Float* p_a_block_odd = p_a_block_double + a_block_space_size;
Float* p_a_block_odd = p_a_block_double + a_block_space_size;
index_t b_block_data_begin = 0;
index_t b_block_data_begin = 0;
...
@@ -371,9 +379,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
...
@@ -371,9 +379,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
const index_t n_thread_data_on_global =
const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col;
n_block_data_on_global + c_thread_mtx_on_block.col;
#endif
#endif
const
index_t
h_thread_id
=
get_thread_local_1d_id
()
/
8
;
const
index_t
w_thread_id
=
get_thread_local_1d_id
()
%
8
;
const
index_t
m_thread_data_on_global
=
m_block_data_on_global
;
const
index_t
m_thread_data_on_global
=
m_block_data_on_global
;
const
index_t
h_thread_data_on_global
=
h_block_data_on_global
+
h_thread_id
;
const
index_t
h_thread_data_on_global
=
h_block_data_on_global
+
h_thread_id
;
const
index_t
w_thread_data_on_global
=
w_block_data_on_global
+
w_thread_id
;
const
index_t
w_thread_data_on_global
=
w_block_data_on_global
+
w_thread_id
;
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
View file @
88d51698
...
@@ -78,7 +78,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
...
@@ -78,7 +78,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
constexpr
index_t
KPerThread
=
16
;
constexpr
index_t
KPerThread
=
16
;
constexpr
index_t
HPerThread
=
1
;
constexpr
index_t
HPerThread
=
1
;
constexpr
index_t
WPerThread
=
1
;
constexpr
index_t
WPerThread
=
1
;
constexpr
index_t
CYXPerThread
=
4
;
constexpr
index_t
CYXPerThread
=
4
*
3
*
3
;
using
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
=
Sequence
<
9
,
1
>
;
using
GemmABlockTransferThreadSliceLengths_GemmK_GemmM
=
Sequence
<
9
,
1
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
=
Sequence
<
4
,
16
>
;
using
GemmABlockTransferThreadClusterLengths_GemmK_GemmM
=
Sequence
<
4
,
16
>
;
...
@@ -86,8 +86,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
...
@@ -86,8 +86,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
constexpr
index_t
GemmABlockTransferSrcScalarPerVector_GemmK
=
1
;
constexpr
index_t
GemmABlockTransferSrcScalarPerVector_GemmK
=
1
;
constexpr
index_t
GemmABlockTransferDstScalarPerVector_GemmM
=
1
;
constexpr
index_t
GemmABlockTransferDstScalarPerVector_GemmM
=
1
;
using
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN
=
Sequence
<
4
,
1
>
;
using
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN
=
Sequence
<
36
,
1
>
;
using
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN
=
Sequence
<
9
,
64
>
;
using
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN
=
Sequence
<
1
,
64
>
;
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmBBlockTransferSrcScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
constexpr
index_t
GemmBBlockTransferDstScalarPerVector_GemmN
=
1
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment