gaoqiong / composable_kernel / Commits

Commit 415b4cbd, authored Mar 15, 2021 by root
Parent: 91ef99a7

    debugging input load

Showing 3 changed files with 46 additions and 32 deletions (+46 −32):

  composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp                         +18 −5
  composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp                  +24 −23
  driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp  +4 −4
composable_kernel/include/tensor_operation/blockwise_gemm_v3.hpp
@@ -76,7 +76,18 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
     __device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
     {
-        return MatrixIndex{1, 8, 8};
+        constexpr index_t H = BlockMatrixB{}.GetLength(Number<2>{});
+        constexpr index_t W = BlockMatrixB{}.GetLength(Number<3>{});
+
+        constexpr auto num_w_threads = W / WPerThread;
+        constexpr auto num_h_threads = H / HPerThread;
+
+        index_t k_thread_id = thread_id / (num_w_threads * num_h_threads);
+        index_t h_thread_id = thread_id / num_w_threads;
+        index_t w_thread_id = thread_id % num_w_threads;
+
+        return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
     }

     template <typename SrcDesc,
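For reference, the replacement of the hard-coded `MatrixIndex{1, 8, 8}` can be sanity-checked on the host. The following is a standalone mock (plain C++, with hypothetical constants matching the v5r1 driver: an 8 x 8 (H, W) block tile and one output pixel per thread). Note that `h_thread_id = thread_id / num_w_threads` stays in range only because all 64 threads share a single k slice; `KPerBlock == KPerThread` is statically asserted in gridwise_dynamic_gemm_v2.hpp below.

#include <cassert>
#include <cstdio>

// Standalone mock of GetBeginOfThreadMatrixC. H, W, HPerThread, WPerThread
// are hypothetical values chosen to match the v5r1 driver constants.
constexpr int H = 8, W = 8;
constexpr int HPerThread = 1, WPerThread = 1;

struct MatrixIndex { int k, h, w; };

MatrixIndex begin_of_thread_matrix_c(int thread_id)
{
    constexpr int num_w_threads = W / WPerThread; // 8
    constexpr int num_h_threads = H / HPerThread; // 8

    // Same arithmetic as the diff; valid while thread_id < num_h * num_w,
    // i.e. while there is a single k slice of threads.
    int k_thread_id = thread_id / (num_w_threads * num_h_threads);
    int h_thread_id = thread_id / num_w_threads;
    int w_thread_id = thread_id % num_w_threads;

    return MatrixIndex{k_thread_id, h_thread_id, w_thread_id};
}

int main()
{
    for(int tid = 0; tid < 64; ++tid)
    {
        MatrixIndex idx = begin_of_thread_matrix_c(tid);
        assert(idx.k == 0 && idx.h == tid / 8 && idx.w == tid % 8);
    }
    std::printf("64 threads map onto an 8x8 (h, w) grid, one element each\n");
}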
@@ -127,10 +138,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
             make_tuple(Number<CYXPerThreadLoop>{}, Number<KPerThread>{}));

         constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
+            // make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            // make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<HPerThread>{},
+            // Number<WPerThread>{}));
             make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}));

         constexpr auto c_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
+            // make_tuple(Number<KPerThread>{}, Number<1>{},
+            // Number<HPerThread>{}, Number<WPerThread>{}));
             make_tuple(Number<KPerThread>{}, Number<1>{}));

         FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
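`make_dynamic_naive_tensor_descriptor_packed_v2` builds a dense (packed) descriptor, so the `GetElementSpaceSize()` that sizes the `p_a_thread` register array is simply the product of the lengths passed to `make_tuple`. A compile-time mock of that relationship follows; the `PackedDesc` helper is hypothetical, and CYXPerThreadLoop is assumed to be 9 to match CYXPerThread in the v5r1 driver.

#include <cstddef>

// Hypothetical stand-in for a packed naive descriptor: the element space
// size of a packed tensor is the product of its dimension lengths.
template <std::size_t... Lengths>
struct PackedDesc
{
    static constexpr std::size_t GetElementSpaceSize() { return (Lengths * ... * 1); }
};

// Mirrors a_thread_mtx / b_thread_mtx / c_thread_mtx above, assuming
// CYXPerThreadLoop = 9 and KPerThread = 16 as in the v5r1 driver.
using AThreadMtx = PackedDesc<9, 16>;
using BThreadMtx = PackedDesc<9, 1>;
using CThreadMtx = PackedDesc<16, 1>;

static_assert(AThreadMtx::GetElementSpaceSize() == 144, "9 * 16 registers for A");
static_assert(BThreadMtx::GetElementSpaceSize() == 9, "");
static_assert(CThreadMtx::GetElementSpaceSize() == 16, "");

float p_a_thread[AThreadMtx::GetElementSpaceSize()]; // 144 elements, as above

int main() {}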
@@ -147,15 +161,14 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
         // loop over k
         for(index_t cyx_begin = 0; cyx_begin < CYXPerBlock; cyx_begin += CYXPerThreadLoop)
         {
-#if 0
+#if 1
             a_thread_copy.Run(
                 p_a_block + a_block_mtx.CalculateOffset(make_tuple(cyx_begin, 0)) + mMyThreadOffsetA,
-                p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, 0)));
+                p_a_thread);
 #else
             for(index_t i = 0; i < a_thread_mtx.GetElementSpaceSize(); i++)
                 p_a_thread[i] = 1;
 #endif

             threadwise_gemm.Run(p_a_thread, p_b_thread + cyx_begin, p_c_thread);
         }
     }
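The toggled `#if` pair is the point of the commit message: with the copy branch enabled, the A tile is loaded through `a_thread_copy`, while the fallback branch fills `p_a_thread` with ones, which turns the threadwise GEMM into a plain per-thread sum over B and so isolates the input-load path from the multiply. A host-side illustration of that identity (plain C++, hypothetical tile sizes):

#include <cassert>

// With A forced to all-ones, C[m] = sum_k A[k][m] * B[k] reduces to sum_k B[k]:
// any mismatch against a straight sum of B implicates the input load, not the GEMM.
int main()
{
    constexpr int K = 9, M = 16; // hypothetical thread-tile sizes
    float a[K][M], b[K], c[M] = {0};

    for(int k = 0; k < K; ++k)
    {
        b[k] = static_cast<float>(k + 1);
        for(int m = 0; m < M; ++m)
            a[k][m] = 1.0f; // the debug path: A forced to 1
    }

    for(int k = 0; k < K; ++k) // threadwise "GEMM"
        for(int m = 0; m < M; ++m)
            c[m] += a[k][m] * b[k];

    float sum_b = 0;
    for(int k = 0; k < K; ++k)
        sum_b += b[k];

    for(int m = 0; m < M; ++m)
        assert(c[m] == sum_b); // every output column sees sum(B)
}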
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
@@ -95,32 +95,32 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         // divide block work by [M, N]
 #if 1
         const auto m_block_work_num = K / Number<KPerBlock>{};
-        const auto nhw_block_work_num = (N * H * W) / (Number<HPerBlock>{} * Number<WPerBlock>{});
+        const auto hw_block_work_num = (N * H * W) / (Number<HPerBlock>{} * Number<WPerBlock>{});

-        const index_t k_block_work_id  = get_block_1d_id() / nhw_block_work_num;
-        const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num;
+        const index_t k_block_work_id  = get_block_1d_id() / hw_block_work_num;
+        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;

         constexpr auto h_num_threads = HPerBlock / HPerThread;
         constexpr auto w_num_threads = WPerBlock / WPerThread;

         static_assert(KPerBlock == KPerThread, "");

-        const auto h_thread_id = get_thread_local_1d_id() / h_num_threads;
+        const auto h_thread_id = get_thread_local_1d_id() / w_num_threads;
         const auto w_thread_id = get_thread_local_1d_id() % w_num_threads;
 #else
         // Hack: this force result into SGPR
         const index_t m_block_work_num = __builtin_amdgcn_readfirstlane(K / KPerBlock);
-        const index_t nhw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);
+        const index_t hw_block_work_num = __builtin_amdgcn_readfirstlane(N / HWPerBlock);

-        const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(get_block_1d_id() / nhw_block_work_num);
-        const index_t nhw_block_work_id = get_block_1d_id() - k_block_work_id * nhw_block_work_num;
+        const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(get_block_1d_id() / hw_block_work_num);
+        const index_t hw_block_work_id = get_block_1d_id() - k_block_work_id * hw_block_work_num;
 #endif

         const index_t m_block_data_on_global = k_block_work_id * KPerBlock;

-        const index_t h_block_data_on_global = nhw_block_work_id * HPerBlock;
-        const index_t w_block_data_on_global = nhw_block_work_id * WPerBlock;
+        const index_t h_block_data_on_global = hw_block_work_id * HPerBlock;
+        const index_t w_block_data_on_global = hw_block_work_id * WPerBlock;

         // lds max alignment
         constexpr auto max_lds_align =
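Both branches of the `#if` compute the same split of blocks over (k, hw) work and of threads over the (h, w) tile; the `#else` branch additionally routes the block-id division through `__builtin_amdgcn_readfirstlane`, an AMDGPU intrinsic that broadcasts lane 0's value across the wavefront so the compiler can keep the wave-uniform result in a scalar register (SGPR), as the "Hack" comment says. Below is a minimal HIP-style sketch of the same decomposition; the kernel and the problem sizes are hypothetical, not part of the diff, while the intrinsic, `blockIdx`, and `threadIdx` are real.

#include <hip/hip_runtime.h>

// Hypothetical HIP kernel mirroring the work decomposition above.
// hw_block_work_num = (N * H * W) / (HPerBlock * WPerBlock); the
// readfirstlane broadcast hints the compiler to keep the quotient scalar.
__global__ void decompose_block_work(int* out, int hw_block_work_num)
{
    const int k_block_work_id =
        __builtin_amdgcn_readfirstlane(static_cast<int>(blockIdx.x) / hw_block_work_num);
    const int hw_block_work_id =
        static_cast<int>(blockIdx.x) - k_block_work_id * hw_block_work_num;

    // Per-thread split inside the block: with HPerBlock = WPerBlock = 8 and
    // HPerThread = WPerThread = 1 (the v5r1 driver), this is an 8x8 grid.
    constexpr int w_num_threads = 8;
    const int h_thread_id = static_cast<int>(threadIdx.x) / w_num_threads;
    const int w_thread_id = static_cast<int>(threadIdx.x) % w_num_threads;

    if(h_thread_id == 0 && w_thread_id == 0)
        out[blockIdx.x] = k_block_work_id * 1000 + hw_block_work_id;
}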
@@ -166,7 +166,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             a_cyx_k_block_desc, make_multi_index(0, 0));

 #if 1
         constexpr auto b_cyx_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
             make_tuple(Number<CYXPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));
@@ -191,7 +190,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             make_multi_index(
                 0, 0, h_block_data_on_global + h_thread_id, w_block_data_on_global + w_thread_id));
 #endif

         // c_thread_mtx definition: this is a mess
         // TODO:: more elegent way of defining c_thread_mtx
         constexpr auto c_k_n_h_w_thread_desc =
@@ -264,20 +262,20 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             a_blockwise_copy.RunWrite(a_cyx_k_block_desc, p_a_block_double);

-#if 0
+#if 1
             __syncthreads();

             // blockwise_gemm.Run(p_a_block_double, p_b_thread_double, p_c_thread);

             index_t sum = 0;
             for(index_t i = 0; i < b_cyx_n_h_w_thread_desc.GetElementSpaceSize(); i++)
-                sum += p_b_thread[i];
+                sum += p_b_thread_double[i];

-            p_c_thread[0] = get_thread_local_1d_id() * 10000 + sum;
+            p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2];
+            p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5];
+            p_c_thread[0] += p_b_thread_double[6] + p_b_thread_double[7] + p_b_thread_double[8];
 #endif
         }

-#if 1
+#if 0
         if constexpr(HasMainKBlockLoop)
         {
             Float* p_a_block_even = p_a_block_double;
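In this hunk the blockwise GEMM call stays commented out and each thread instead accumulates the first nine elements of `p_b_thread_double` into `p_c_thread[0]` (nine matching CYXPerThread = 3 * 3 set in the driver below), so the written-back C tensor becomes a direct dump of what the B copy actually loaded. A host-side mock of that check, with hypothetical values:

#include <cassert>

// Mock of the debug accumulation above: p_c_thread[0] collects the first
// nine loaded B values, so inspecting C on the host reveals the input load.
int main()
{
    constexpr int n = 9; // CYXPerThread = 3 * 3 in the v5r1 driver
    float p_b_thread_double[n];
    for(int i = 0; i < n; ++i)
        p_b_thread_double[i] = static_cast<float>(i); // stand-in for loaded input

    float p_c_thread[1] = {0.0f};
    p_c_thread[0] += p_b_thread_double[0] + p_b_thread_double[1] + p_b_thread_double[2];
    p_c_thread[0] += p_b_thread_double[3] + p_b_thread_double[4] + p_b_thread_double[5];
    p_c_thread[0] += p_b_thread_double[6] + p_b_thread_double[7] + p_b_thread_double[8];

    assert(p_c_thread[0] == 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8); // 36
}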
@@ -398,8 +396,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
         {
             // define input tensor descriptor for threadwise copy
             // thread input tensor, src of threadwise copy
-            constexpr auto c_k_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
-                make_tuple(Number<KPerThread>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            constexpr auto c_k_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
+                make_tuple(Number<KPerThread>{}, Number<1>{}, Number<HPerThread>{}, Number<WPerThread>{}));

             // calculate origin of thread input tensor on global memory
             // blockwise GEMM c matrix starting index
@@ -414,8 +413,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
                 n_block_data_on_global + c_thread_mtx_on_block.col;
 #endif

             const index_t m_thread_data_on_global = m_block_data_on_global;
-            const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id;
-            const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id;
+            const index_t h_thread_data_on_global = h_block_data_on_global + h_thread_id * HPerThread;
+            const index_t w_thread_data_on_global = w_block_data_on_global + w_thread_id * WPerThread;

             // hack to control index calculation when iterating over c_k_n_h_w_global tensor
             constexpr auto c_k_n_h_w_global_tensor_iterator_hacks = CGlobalIteratorHacks{};
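Scaling the thread ids by HPerThread and WPerThread gives neighbouring threads disjoint rows and columns of the global output instead of overlapping ones. With the driver's HPerThread = WPerThread = 1 the change is a no-op, but the arithmetic matters as soon as a thread covers more than one pixel, as this small check shows (hypothetical HPerThread = 2):

#include <cassert>

int main()
{
    // Hypothetical HPerThread = 2; the driver below uses 1, where the
    // multiplier changes nothing.
    constexpr int HPerThread = 2;
    constexpr int h_block_data_on_global = 8; // this block's first output row

    for(int h_thread_id = 0; h_thread_id < 4; ++h_thread_id)
    {
        // Old form: h_block_data_on_global + h_thread_id -> rows 8,9,10,11,
        // so each thread's 2-row tile overlaps its neighbour's.
        // New form: strided by HPerThread -> rows 8,10,12,14, disjoint tiles.
        const int h_thread_data_on_global = h_block_data_on_global + h_thread_id * HPerThread;
        assert(h_thread_data_on_global == 8 + 2 * h_thread_id);
    }
}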
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
@@ -73,20 +73,20 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
     constexpr index_t KPerBlock   = 16;
     constexpr index_t HPerBlock   = 8;
     constexpr index_t WPerBlock   = 8;
-    constexpr index_t CYXPerBlock = 4;
+    constexpr index_t CYXPerBlock = 4 * 3 * 3;

     constexpr index_t KPerThread   = 16;
     constexpr index_t HPerThread   = 1;
     constexpr index_t WPerThread   = 1;
-    constexpr index_t CYXPerThread = 4;
+    constexpr index_t CYXPerThread = 3 * 3;

-    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<1, 1>;
+    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<9, 1>;
     using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;

     constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
     constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

-    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<4, 1>;
+    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<36, 1>;
     using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;

     constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
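The retuned constants widen the reduction tile from 4 to a full 3 x 3 filter window over 4 input channels: CYXPerBlock = 4 * 3 * 3 = 36 and CYXPerThread = 3 * 3 = 9. The new slice lengths then follow from dividing the block tile by the thread-cluster shape, which this plain-C++ mirror checks at compile time (assuming, as the cluster lengths suggest, a GemmM extent of 16 and a GemmN extent of HPerBlock * WPerBlock = 64 per block):

// Compile-time mirror of the v5r1 driver constants after this commit.
// Per dimension: slice length = block-tile extent / thread-cluster extent.
constexpr int KPerBlock     = 16;        // GemmM extent per block (assumed)
constexpr int GemmNPerBlock = 8 * 8;     // HPerBlock * WPerBlock (assumed)
constexpr int CYXPerBlock   = 4 * 3 * 3; // GemmK extent per block

// A copy: 4 x 16 thread cluster over a 36 x 16 tile -> <9, 1> slices per thread.
static_assert(CYXPerBlock / 4 == 9 && KPerBlock / 16 == 1, "GemmA slice <9, 1>");

// B copy: 1 x 64 thread cluster over a 36 x 64 tile -> <36, 1> slices per thread.
static_assert(CYXPerBlock / 1 == 36 && GemmNPerBlock / 64 == 1, "GemmB slice <36, 1>");

int main() {}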