Fix synchronization scheme.

b398481e · Adam Osewski · 91343ec1 · b398481e · b398481e
Commit b398481e authored Feb 29, 2024 by Adam Osewski
Showing with 5 additions and 3 deletions

include/ck/utility/work_scheduling.hpp include/ck/utility/work_scheduling.hpp +4 -2

test/work_scheduling/test_strided_reduction_tile_loop.cpp test/work_scheduling/test_strided_reduction_tile_loop.cpp +1 -1

No files found.
--- a/include/ck/utility/work_scheduling.hpp
+++ b/include/ck/utility/work_scheduling.hpp
@@ -116,7 +116,7 @@ class StridedReductionTileLoop
        index_t neighbour_count = 0;
        if(tiles_per_block_ < k_tiles)
        {
-            // Since we can have deviation (+1) in neighbours number
+            // Since we can have deviation (+/-1) in neighbours number
            // we calculate how many workgroups are needed to process the k-tiles left.
            neighbour_count = (k_tiles - k_tile_idx - 1 + tiles_per_block_ - 1) / tiles_per_block_;
        }
@@ -139,7 +139,9 @@ class StridedReductionTileLoop
        if(neighbour_count > 0)
        {
-            finished_block_flags_.wait_lt(
+            // Also count this workgroup
+            neighbour_count++;
+            finished_block_flags_.wait_eq(
                GetWorkgroupFlagIdx(k_tiles, output_tile_idx, output_tile_idx_offset),
                neighbour_count);
        }

--- a/test/work_scheduling/test_strided_reduction_tile_loop.cpp
+++ b/test/work_scheduling/test_strided_reduction_tile_loop.cpp
@@ -156,7 +156,7 @@ __global__ void grouped_gemm_naive_strided_tile_loop_reduce(const GemmArgDesc* p
            // Accumulate partial results. We can have different # of workgroups to reduce, thus we
            // read actual flag value.
-            for(index_t i = 1; i <= neighbour_count; ++i)
+            for(index_t i = 1; i < neighbour_count; ++i)
            {
                partial_result += p_workspace[(get_block_1d_id()) * MPerBlock * NPerBlock +
                                              i * MPerBlock * NPerBlock + get_thread_local_1d_id()];