Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b753bbc5
Commit
b753bbc5
authored
May 30, 2023
by
carlushuang
Browse files
fix a bug in final reduce
parent
d19487eb
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
28 additions
and
21 deletions
+28
-21
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
...sor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
+3
-4
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
...ensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
+25
-17
No files found.
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
View file @
b753bbc5
...
...
@@ -168,10 +168,9 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
else
if
constexpr
(
GridwiseGemm
::
Block2CTileMap
::
ReductionStrategy
==
StreamKReductionStrategy
::
Reduction
)
{
char
*
workspace_semaphore
=
reinterpret_cast
<
char
*>
(
karg
.
p_workspace_
);
workspace_semaphore
=
workspace_semaphore
+
karg
.
block_mapping
.
get_workspace_size_for_acc
(
sizeof
(
typename
GridwiseGemm
::
FloatAcc
));
char
*
workspace_semaphore
=
reinterpret_cast
<
char
*>
(
karg
.
p_workspace_
)
+
karg
.
block_mapping
.
get_workspace_size_for_acc
(
sizeof
(
typename
GridwiseGemm
::
FloatAcc
));
auto
preprocess
=
[
&
]()
{
hipGetErrorString
(
hipMemset
(
workspace_semaphore
,
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_streamk.hpp
View file @
b753bbc5
...
...
@@ -562,7 +562,6 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
constexpr
auto
acc_thread_buf_load_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
Number
<
CBlockTransferScalarPerVector_NWaveNPerXDL
>
{}));
constexpr
auto
acc_thread_buf_store_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
I1
,
I1
,
Number
<
CBlockTransferScalarPerVector_NWaveNPerXDL
>
{}));
...
...
@@ -572,7 +571,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
0
,
cluster_length_reduce
.
At
(
I1
)
*
CBlockTransferScalarPerVector_NWaveNPerXDL
);
constexpr
auto
partial_acc_load_step_n_reverse
=
make_multi_index
(
0
,
-
1
*
(
MReduceIters
-
1
)
*
cluster_length_reduce
.
At
(
I1
)
*
-
1
*
cluster_length_reduce
.
At
(
I1
)
.
value
*
(
NReduceIters
-
1
)
*
CBlockTransferScalarPerVector_NWaveNPerXDL
);
constexpr
auto
partial_acc_load_step_m
=
make_multi_index
(
cluster_length_reduce
.
At
(
I0
),
0
);
...
...
@@ -653,17 +652,22 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
// block synchronization
wg_barrier
.
wait_eq
(
reduction_idx
,
tile_acc_offset_end
-
tile_acc_offset_start
);
// if(threadIdx.x == 0) {
#if 0
if(threadIdx.x == 0) {
// if(reduction_idx == 0){
// printf("(miter:%d, niter:%d, cluster red:%d,%d)bid:%d, rid:%d, os:%d-%d(%d),
// spatial:%d-%d, tid:%d, %d, %d\n",
// MReduceIters(), NReduceIters(), cluster_length_reduce.At(I0).value,
// cluster_length_reduce.At(I1).value, static_cast<int>(blockIdx.x),
// reduction_idx, tile_acc_offset_start, tile_acc_offset_end,
// tile_acc_offset_end - tile_acc_offset_start, spatial_idx[I0],
// spatial_idx[I1], static_cast<int>(threadIdx.x), thread_m_cluster_id,
// thread_n_cluster_id);
// }
// printf("(cluster red:%d,%d)bid:%d, rid:%d, os:%d-%d(%d), spatial:%d-%d, tid:%d, %d, %d\n",
// cluster_length_reduce.At(I0).value,
// cluster_length_reduce.At(I1).value, static_cast<int>(blockIdx.x),
// reduction_idx, tile_acc_offset_start, tile_acc_offset_end,
// tile_acc_offset_end - tile_acc_offset_start, spatial_idx[I0],
// spatial_idx[I1], static_cast<int>(threadIdx.x), thread_m_cluster_id,
// thread_n_cluster_id);
printf("bid:%d, rid:%d, os:%d,%d, spatial:%d,%d\n", static_cast<int>(blockIdx.x),
reduction_idx, __builtin_amdgcn_readfirstlane(tile_acc_offset_start), __builtin_amdgcn_readfirstlane(tile_acc_offset_end),
__builtin_amdgcn_readfirstlane(spatial_idx[I0]),
__builtin_amdgcn_readfirstlane(spatial_idx[I1]));
}
#endif
using
Accumulation
=
ck
::
detail
::
AccumulateWithNanCheck
<
false
/*PropagateNan*/
,
reduce
::
Add
,
FloatAcc
>
;
...
...
@@ -698,11 +702,15 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
});
}
acc_store
.
Run
(
acc_thread_buf_store_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
acc_buf
,
c_grid_desc_mblock_mperblock_nblock_nperblock
,
c_grid_buf
);
if
(
thread_n_cluster_id
*
CBlockTransferScalarPerVector_NWaveNPerXDL
<
NPerBlock
)
{
acc_store
.
Run
(
acc_thread_buf_store_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
acc_buf
,
c_grid_desc_mblock_mperblock_nblock_nperblock
,
c_grid_buf
);
}
if
constexpr
(
NReduceIters
!=
1
)
{
if
constexpr
(
i_n_reduce
!=
(
NReduceIters
-
1
))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment