Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
2049ab57
Commit
2049ab57
authored
Feb 01, 2021
by
Jing Zhang
Browse files
fixed output
parent
a3edbec9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
1 addition
and
34 deletions
+1
-34
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
...lude/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
+1
-34
No files found.
composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_fp16_bfp16.hpp
View file @
2049ab57
...
...
@@ -587,7 +587,6 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
constexpr
index_t
BlkSize
=
blockwise_gemm
.
GetBlkSize
();
constexpr
index_t
NumBlks
=
blockwise_gemm
.
GetNumBlks
();
#if 1
// force unrolling the output loop to get ride of scratches
static_for
<
0
,
NumBlks
,
1
>
{}([
&
](
auto
blk_id
)
{
// calculate origin of thread output tensor on global memory
...
...
@@ -618,40 +617,8 @@ struct GridwiseBatchGemmXdlops_gkmkpack_gknkpack_gmn_v2
m_thread_data_on_global
%
(
M2
*
M1
)
/
M2
,
m_thread_data_on_global
%
M2
,
n_thread_data_on_global
))
.
Store
(
c_thread_vec
,
p_c_global
);
.
Store
(
c_thread_vec
.
GetVector
(
Number
<
BlkSize
>
{})[
Number
<
blk_id
>
{}]
,
p_c_global
);
});
#else
for
(
index_t
i
=
0
;
i
<
NumBlks
;
++
i
)
{
// calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index
const
auto
c_thread_mtx_on_block
=
blockwise_gemm
.
GetBeginOfThreadMatrixC
(
i
);
const
index_t
m_thread_data_on_global
=
m_block_data_on_global
+
c_thread_mtx_on_block
.
row
;
const
index_t
n_thread_data_on_global
=
n_block_data_on_global
+
c_thread_mtx_on_block
.
col
;
ThreadwiseGenericTensorSliceCopy_v4r2
<
decltype
(
c_g_m0_m1_m2_n_thread_desc
),
decltype
(
c_g_m0_m1_m2_n_global_desc
),
CThreadCopySliceLengths
,
arithmetic_sequence_gen
<
0
,
5
,
1
>::
type
,
4
,
1
,
1
,
AddressSpace
::
Vgpr
,
AddressSpace
::
Global
,
CGlobalMemoryOp
>
(
make_multi_index
(
0
,
0
,
0
,
0
,
0
),
make_multi_index
(
g_block_data_on_global
,
m_thread_data_on_global
/
(
M2
*
M1
),
m_thread_data_on_global
%
(
M2
*
M1
)
/
M2
,
m_thread_data_on_global
%
M2
,
n_thread_data_on_global
))
.
Run
(
c_thread_vec
.
n
+
i
*
BlkSize
,
p_c_global
);
}
#endif
}
}
};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment