Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
999ac250
Commit
999ac250
authored
Jul 03, 2024
by
Adam Osewski
Browse files
Force vmem synchronization.
parent
277ad347
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
3 deletions
+12
-3
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp
...grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp
+3
-0
include/ck/utility/work_scheduling.hpp
include/ck/utility/work_scheduling.hpp
+9
-3
No files found.
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_tile_loop.hpp
View file @
999ac250
...
...
@@ -161,6 +161,9 @@ __global__ void
// do CShuffle in flight with loading partials products of other peer workgroups.
GridwiseGemm
::
StorePartials
(
p_workspace
,
static_cast
<
void
*>
(
p_shared
),
results_buffer
);
work_scheduler
.
FlagFinished
();
// When above would be done, then this will be necessary only when there are multiple
// WGPs reducing along K-dim.
work_scheduler
.
SynchronizeVmem
();
// The workgroup which processed first K tile accumulates results and stores to GMEM
if
(
b2c_tile_map
.
IsFirstKSplitBlock
())
...
...
include/ck/utility/work_scheduling.hpp
View file @
999ac250
...
...
@@ -39,7 +39,7 @@ class StridedReductionTileLoop
tile_id_
{
__builtin_amdgcn_readfirstlane
(
get_block_1d_id
()
*
tiles_per_block_
)},
block_tile_idx_
{
__builtin_amdgcn_readfirstlane
(
0
)},
finished_block_flags_
{
p_flags
},
is_sync_needed_
{
1
}
is_sync_needed_
{
true
}
{
}
...
...
@@ -179,7 +179,13 @@ class StridedReductionTileLoop
__device__
void
SetIsSyncNeeded
(
index_t
next_k_tiles
,
index_t
k_tiles
)
{
is_sync_needed_
=
__builtin_amdgcn_readfirstlane
(
next_k_tiles
==
k_tiles
?
0
:
1
);
is_sync_needed_
=
(
next_k_tiles
==
k_tiles
?
0
:
1
);
}
__device__
void
SynchronizeVmem
()
const
{
__builtin_amdgcn_s_waitcnt
(
0x0f70
);
// s_waitcnt vmcnt(0)
__builtin_amdgcn_s_barrier
();
// s_barrier
}
const
index_t
tile_count_
;
...
...
@@ -187,7 +193,7 @@ class StridedReductionTileLoop
index_t
tile_id_
;
index_t
block_tile_idx_
;
workgroup_barrier
finished_block_flags_
;
index_t
is_sync_needed_
;
bool
is_sync_needed_
;
};
}
// namespace ck
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment