"testing/python/jit/test_tilelang_jit_gemm_cython.py" did not exist on "38ba083b581d3f9f1424e1bcfd45ca068bce65cd"
Commit 8df45c9d authored by Lei Wang, committed by LeiWang1999

[Bugfix] Avoid duplicate data access when a cross-thread buffer meets a replicate register (#606)

* [Enhancement] Improve debug output formatting in layout and fragment nodes

- Updated the `DebugOutput` methods in `LayoutNode` and `FragmentNode` to provide more structured and informative output, including transformation details and thread range information.
- Enhanced layout inference logic in `ParallelOp` to add predicates for cross-thread shared memory access, improving layout handling in parallel operations.
- Minor adjustment in `layout_inference.cc` to ensure clarity in parallel loop handling.

* lint fix
parent a8811d9b
@@ -370,19 +370,22 @@ Fragment FragmentNode::CondenseReplicateVar() const {

 std::string LayoutNode::DebugOutput() const {
   std::stringstream ss;
-  ss << "Layout Shape: " << InputShape() << " -> " << OutputShape() << " -> "
-     << GetForwardIndex();
+  ss << "Layout(" << InputShape() << " -> " << OutputShape()
+     << ", transform: " << GetForwardVars() << " -> " << GetForwardIndex()
+     << ")";
   return ss.str();
 }

 std::string FragmentNode::DebugOutput() const {
   std::stringstream ss;
-  ss << "Fragment Shape: " << InputShape() << " -> " << OutputShape();
-  ss << " -> replicate: " << ReplicateExtent();
-  ss << " -> thread: " << ThreadExtent();
-  ss << " -> forward_thread: " << forward_thread_;
-  ss << " -> forward_index: " << GetForwardIndex();
-  ss << " -> thread_range: " << thread_range_;
+  ss << "Fragment(" << InputShape() << " -> " << OutputShape()
+     << ", replicate: " << ReplicateExtent() << ", thread: " << ThreadExtent()
+     << ", forward_thread: " << forward_thread_
+     << ", forward_index: " << GetForwardIndex();
+  if (thread_range_.defined()) {
+    ss << ", thread_range: " << thread_range_;
+  }
+  ss << ")";
   return ss.str();
 }
...
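For reference, the rewrite turns the old arrow-chained strings into a single parenthesized record per layout. A minimal standalone sketch of the resulting format (plain C++ with made-up placeholder values standing in for InputShape(), GetForwardVars(), and friends, not the actual TVM objects):

#include <iostream>
#include <sstream>
#include <string>

// Placeholder values only, chosen to show the shape of the string the
// new LayoutNode::DebugOutput() produces.
int main() {
  std::string input_shape = "[16, 16]";
  std::string output_shape = "[256]";
  std::string forward_vars = "[i, j]";
  std::string forward_index = "[i * 16 + j]";

  std::stringstream ss;
  ss << "Layout(" << input_shape << " -> " << output_shape
     << ", transform: " << forward_vars << " -> " << forward_index << ")";
  std::cout << ss.str() << "\n";
  // Prints: Layout([16, 16] -> [256], transform: [i, j] -> [i * 16 + j])
  return 0;
}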
@@ -174,16 +174,38 @@ LayoutMap ParallelOp::InferLayout(const LayoutInferArgs &T, InferLevel level) {

   // // Loop don't need to be replicated.
   // if (!is_one(loop_layout_->ReplicateExtent()))
   //   loop_layout_ = loop_layout_->DeReplicate();
-  // // if still has replication, add a condition
-  // if (!is_one(loop_layout_->ReplicateExtent())) {
-  //   auto inv = loop_layout_->Inverse();
-  //   Array<PrimExpr> fwd;
-  //   for (size_t i = 0; i < loop_layout_->OutputDim(); i++)
-  //     fwd.push_back(0);
-  //   fwd.push_back(InputPlaceholder(0));
-  //   auto rep = inv->Forward(fwd).back();
-  //   AddPredicate(EQ(rep, 0));
-  // }
+
+  // For free layout inference
+  // If replication exists and buffer has cross-thread shared memory access,
+  // add predicate
+  bool has_cross_thread_access = false;
+  PostOrderVisit(root_, [&](const ObjectRef &obj) {
+    if (const auto *store = obj.as<BufferStoreNode>()) {
+      // check if scope is shared or global
+      if (store->buffer.scope() == "shared" ||
+          store->buffer.scope() == "shared.dyn" ||
+          store->buffer.scope() == "global") {
+        has_cross_thread_access = true;
+      }
+    } else if (const auto *load = obj.as<BufferLoadNode>()) {
+      // check if scope is shared or global
+      if (load->buffer.scope() == "shared" ||
+          load->buffer.scope() == "shared.dyn" ||
+          load->buffer.scope() == "global") {
+        has_cross_thread_access = true;
+      }
+    }
+  });
+
+  if (!is_one(loop_layout_->ReplicateExtent()) && has_cross_thread_access) {
+    auto inv = loop_layout_->Inverse();
+    Array<PrimExpr> fwd;
+    for (size_t i = 0; i < loop_layout_->OutputDim(); i++)
+      fwd.push_back(0);
+    fwd.push_back(InputPlaceholder(0));
+    auto rep = inv->Forward(fwd).back();
+    AddPredicate(EQ(rep, 0));
+  }
 } else {
   // Vectorize Size must be aware of the buffer_remap
   // As the pass will do post processing to the layout
...
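For intuition about the fix itself, here is a minimal standalone sketch (plain C++; the thread-to-replica mapping is a made-up stand-in for loop_layout_->Inverse(), not the TVM API) of the hazard the predicate removes: when a fragment is replicated, several threads hold the same register value, and without a guard each replica issues the identical store to shared or global memory. Predicating on replica index 0 leaves exactly one writer per element, mirroring AddPredicate(EQ(rep, 0)) above.

#include <cassert>
#include <cstdio>

// Hypothetical configuration: 8 threads with replicate extent 2, so each
// of the 4 logical elements is held by 2 threads.
constexpr int kThreads = 8;
constexpr int kReplicate = 2;
constexpr int kElems = kThreads / kReplicate;

// Stand-in for the inverse layout: recover (element, replica) for a thread.
struct Placement { int element; int replica; };
Placement InverseLayout(int thread) {
  return {thread % kElems, thread / kElems};
}

int main() {
  int writes[kElems] = {0};
  for (int t = 0; t < kThreads; ++t) {  // simulate every thread in the block
    Placement p = InverseLayout(t);
    // Without the predicate every replica would write; with it, only the
    // replica-0 copy performs the cross-thread store.
    if (p.replica == 0) writes[p.element] += 1;
  }
  for (int e = 0; e < kElems; ++e)
    assert(writes[e] == 1);  // exactly one writer per element, no duplicates
  std::printf("each element written exactly once by %d threads\n", kThreads);
  return 0;
}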
@@ -598,6 +598,7 @@ private:

   auto loop_layout = result_.for_map[root];
   bool parallel_loop = !is_register_store && !skip_thread_partition_;
   if (parallel_loop) {
     for_node =
         PartitionLoop(for_node, thread_var_->var, analyzer_, loop_layout);
...