Fix concurrency bug and add a test case that may fail without the fix

798670d8 · rocking · e48ddb6a · 798670d8 · 798670d8
Commit 798670d8 authored Jul 11, 2022 by rocking
Showing 2 changed files with 6 additions and 1 deletion

include/ck/tensor_operation/gpu/grid/gridwise_layernorm.hpp +5 -0

test/layernorm/test_layernorm_util.hpp +1 -1

No files found.
--- a/include/ck/tensor_operation/gpu/grid/gridwise_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_layernorm.hpp
@@ -297,9 +297,14 @@ struct GridwiseLayernorm_mk_to_mk
        } while(reducedTiles < num_k_block_tile_iteration);
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
+            if constexpr(I > 0)
+                block_sync_lds();
            BlockwiseSumReduce::Reduce(reduce_work_buf, mean_thread_buf(I));
            mean_thread_buf(I) = mean_thread_buf(I) / reduce_length;
+            block_sync_lds();
            BlockwiseSumReduce::Reduce(reduce_work_buf, mean_square_thread_buf(I));
            mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length;

--- a/test/layernorm/test_layernorm_util.hpp
+++ b/test/layernorm/test_layernorm_util.hpp
@@ -169,7 +169,7 @@ class TestLayernorm : public ::testing::Test
    }
    std::vector<std::vector<index_t>> lengths_ = {
-        {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}};
+        {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
    std::vector<std::vector<index_t>> reduceDims_ = {{1}};