Commit 999ac250 authored by Adam Osewski's avatar Adam Osewski
Browse files

Force vmem synchronization.

parent 277ad347
...@@ -161,6 +161,9 @@ __global__ void ...@@ -161,6 +161,9 @@ __global__ void
// do CShuffle in flight with loading partials products of other peer workgroups. // do CShuffle in flight with loading partials products of other peer workgroups.
GridwiseGemm::StorePartials(p_workspace, static_cast<void*>(p_shared), results_buffer); GridwiseGemm::StorePartials(p_workspace, static_cast<void*>(p_shared), results_buffer);
work_scheduler.FlagFinished(); work_scheduler.FlagFinished();
// When above would be done, then this will be necessary only when there are multiple
// WGPs reducing along K-dim.
work_scheduler.SynchronizeVmem();
// The workgroup which processed first K tile accumulates results and stores to GMEM // The workgroup which processed first K tile accumulates results and stores to GMEM
if(b2c_tile_map.IsFirstKSplitBlock()) if(b2c_tile_map.IsFirstKSplitBlock())
......
...@@ -39,7 +39,7 @@ class StridedReductionTileLoop ...@@ -39,7 +39,7 @@ class StridedReductionTileLoop
tile_id_{__builtin_amdgcn_readfirstlane(get_block_1d_id() * tiles_per_block_)}, tile_id_{__builtin_amdgcn_readfirstlane(get_block_1d_id() * tiles_per_block_)},
block_tile_idx_{__builtin_amdgcn_readfirstlane(0)}, block_tile_idx_{__builtin_amdgcn_readfirstlane(0)},
finished_block_flags_{p_flags}, finished_block_flags_{p_flags},
is_sync_needed_{1} is_sync_needed_{true}
{ {
} }
...@@ -179,7 +179,13 @@ class StridedReductionTileLoop ...@@ -179,7 +179,13 @@ class StridedReductionTileLoop
__device__ void SetIsSyncNeeded(index_t next_k_tiles, index_t k_tiles) __device__ void SetIsSyncNeeded(index_t next_k_tiles, index_t k_tiles)
{ {
is_sync_needed_ = __builtin_amdgcn_readfirstlane(next_k_tiles == k_tiles ? 0 : 1); is_sync_needed_ = (next_k_tiles == k_tiles ? 0 : 1);
}
__device__ void SynchronizeVmem() const
{
__builtin_amdgcn_s_waitcnt(0x0f70); // s_waitcnt vmcnt(0)
__builtin_amdgcn_s_barrier(); // s_barrier
} }
const index_t tile_count_; const index_t tile_count_;
...@@ -187,7 +193,7 @@ class StridedReductionTileLoop ...@@ -187,7 +193,7 @@ class StridedReductionTileLoop
index_t tile_id_; index_t tile_id_;
index_t block_tile_idx_; index_t block_tile_idx_;
workgroup_barrier finished_block_flags_; workgroup_barrier finished_block_flags_;
index_t is_sync_needed_; bool is_sync_needed_;
}; };
} // namespace ck } // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment