"vscode:/vscode.git/clone" did not exist on "18a81e356eef305b244db0e7c46772925a540a44"
Unverified commit 278c0fbf, authored by Lei Wang, committed by GitHub
Browse files

[Enhancement] Introduce a workaround for layout inference for local buffer store (#1055)



* [Enhancement] Improve layout inference for local buffer handling in parallel operations

* Added logic to check if a loop only manipulates "local" buffers, which affects thread binding decisions.
* Updated the condition for determining parallel loop execution to account for local buffer stores.
* Cleaned up comments for clarity and future considerations.

* [Refactor] Clean up parallel loop condition formatting in layout inference

* Reformatted the condition for determining parallel loop execution for better readability.
* Maintained existing logic while enhancing code clarity for future modifications.

---------
Co-authored-by: Zhiwen Mo <zm125@ic.ac.uk>
parent 37b3dbde
......@@ -429,7 +429,6 @@ LayoutMap ParallelOpNode::InferLayout(const LayoutInferArgs &T,
}
}
});
if (read_source_buffer.defined() && allow_layout_propgate) {
loop_layout_ = compute_loop_layout_from_buffer(read_source_buffer);
}
......
......@@ -719,7 +719,23 @@ private:
// A_local[i] = A_global[i]
// Here, A_local is a register-local buffer held independently by each
// thread, so explicit thread binding is not required.
//
bool store_into_local = false;
PostOrderVisit(root, [&](const ObjectRef &obj) {
if (const auto *store = obj.as<BufferStoreNode>()) {
if (store->buffer.scope() == "local") {
store_into_local = true;
}
// if the case is like:
// for i in T.Parallel(1024):
// A_local[i] = B_global[i]
// A_frag[i] = A_global[i]
// an exception will be raised in Parallel::LayoutInference
}
});
// This check is for loops that only manipulate "local" buffers, e.g.
// for i in T.Parallel(1024):
// A_local[i] = B_local[i]
// Though this might be illegal
// We use PostOrderVisit to detect whether the loop only manipulates
// "local" buffers, which indicates register usage and justifies skipping
// thread binding.
......@@ -738,7 +754,9 @@ private:
auto loop_layout = result_.for_map[root];
// FIXME: tell in-Parallel and out-of-Parallel `local`s apart
bool parallel_loop = !skip_thread_partition_ && !local_register_only;
// NOTE(lei): a bit ugly; we should rethink this part in the future.
bool parallel_loop =
!skip_thread_partition_ && !local_register_only && !store_into_local;
if (parallel_loop) {
for_node =
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment