Commit 999ac250 authored by Adam Osewski's avatar Adam Osewski
Browse files

Force vmem synchronization.

parent 277ad347
......@@ -161,6 +161,9 @@ __global__ void
// do CShuffle in flight with loading partials products of other peer workgroups.
GridwiseGemm::StorePartials(p_workspace, static_cast<void*>(p_shared), results_buffer);
work_scheduler.FlagFinished();
// When above would be done, then this will be necessary only when there are multiple
// WGPs reducing along K-dim.
work_scheduler.SynchronizeVmem();
// The workgroup which processed first K tile accumulates results and stores to GMEM
if(b2c_tile_map.IsFirstKSplitBlock())
......
......@@ -39,7 +39,7 @@ class StridedReductionTileLoop
tile_id_{__builtin_amdgcn_readfirstlane(get_block_1d_id() * tiles_per_block_)},
block_tile_idx_{__builtin_amdgcn_readfirstlane(0)},
finished_block_flags_{p_flags},
is_sync_needed_{1}
is_sync_needed_{true}
{
}
......@@ -179,7 +179,13 @@ class StridedReductionTileLoop
__device__ void SetIsSyncNeeded(index_t next_k_tiles, index_t k_tiles)
{
is_sync_needed_ = __builtin_amdgcn_readfirstlane(next_k_tiles == k_tiles ? 0 : 1);
is_sync_needed_ = (next_k_tiles == k_tiles ? 0 : 1);
}
__device__ void SynchronizeVmem() const
{
__builtin_amdgcn_s_waitcnt(0x0f70); // s_waitcnt vmcnt(0)
__builtin_amdgcn_s_barrier(); // s_barrier
}
const index_t tile_count_;
......@@ -187,7 +193,7 @@ class StridedReductionTileLoop
index_t tile_id_;
index_t block_tile_idx_;
workgroup_barrier finished_block_flags_;
index_t is_sync_needed_;
bool is_sync_needed_;
};
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment