"ts/webui/src/git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "60e1e01f625aa63905e1a43e33876d246141d375"
Commit bb1a5fd8 authored by Lei Wang, committed by LeiWang1999

[Enhancement] Remove DeReplicate during parallel loop layout inference (#430)

* [Refactor] Adjust layout inference calculations in Gemm and ParallelOp

* Updated block size calculation in Gemm to account for the range of thread bounds, improving accuracy in layout inference.
* Simplified layout conflict error messages in ParallelOp for better clarity, improving the debugging experience.
* Removed redundant buffer checks in ParallelOp layout inference logic, streamlining the code.

* [Refactor] Clean up layout inference logic in Gemm and ParallelOp

* Removed unnecessary warning log in Gemm related to WGMMA conditions, streamlining the layout inference process.
* Commented out redundant checks in ParallelOp's layout inference, improving code clarity while maintaining functionality.
* Enhanced error messages in ParallelOp to provide clearer context for layout conflicts, aiding in debugging efforts.

* lint fix
parent 97d63fab
@@ -161,7 +161,8 @@ LayoutMap Gemm::InferLayout(const LayoutInferArgs &T, InferLevel level) {
    return {};
  LayoutMap results;
  ICHECK(C.scope() == "local.fragment");
-  auto block_size = *as_const_int(T.thread_bounds->extent);
+  auto block_size = *as_const_int(T.thread_bounds->extent) -
+                    *as_const_int(T.thread_bounds->min);
  if (TargetIsVolta(T.target)) {
    const int warp_size = 32;
    auto [warp_m, warp_n] =
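
Note (not part of the diff): a minimal sketch, assuming TVM's Range and as_const_int semantics, of how a constant block size is extracted from the thread bounds as in the updated line above; the helper name is hypothetical.

#include <tvm/ir/expr.h>
#include <tvm/runtime/logging.h>
#include <tvm/tir/op.h>

// Hypothetical helper mirroring the updated expression in the hunk above.
int64_t BlockSizeFromBounds(const tvm::Range &thread_bounds) {
  const int64_t *extent = tvm::tir::as_const_int(thread_bounds->extent);
  const int64_t *min = tvm::tir::as_const_int(thread_bounds->min);
  // as_const_int returns nullptr when the expression is not a compile-time
  // constant, so the dereference in the diff assumes constant thread bounds.
  ICHECK(extent != nullptr && min != nullptr);
  return *extent - *min;
}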
@@ -220,10 +221,6 @@ LayoutMap Gemm::InferLayout(const LayoutInferArgs &T, InferLevel level) {
  } else if (TargetIsHopper(T.target)) {
    const int warp_size = 32;
    bool maybe_wgmma = (this->M >= 64) && (block_size / warp_size % 4 == 0);
-    if (!maybe_wgmma) {
-      LOG(WARNING)
-          << "WGMMA is not enabled because M < 64 or block_size % 128 != 0";
-    }
    auto [warp_m, warp_n] =
        ComputeWarpPartition(block_size / warp_size, T.target, maybe_wgmma);
    auto fragment =
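
Note (not part of the diff): a small standalone sketch of the WGMMA eligibility check kept above; it requires M >= 64 and a warp count divisible by 4, i.e. block_size a multiple of 128 threads. The values below are hypothetical.

#include <cstdio>

int main() {
  const int warp_size = 32;
  const int M = 64;
  for (int block_size : {64, 128, 256}) {
    // Same condition as in the hunk above: at least one full warp group.
    bool maybe_wgmma = (M >= 64) && (block_size / warp_size % 4 == 0);
    std::printf("block_size=%d -> maybe_wgmma=%d\n", block_size, maybe_wgmma);
  }
  return 0;
}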
@@ -181,24 +181,22 @@ LayoutMap ParallelOp::InferLayout(const LayoutInferArgs &T, InferLevel level) {
  };
  if (source_buffer.defined()) {
    loop_layout_ = compute_loop_layout_from_buffer(source_buffer);
  } else if (read_source_buffer.defined()) {
    loop_layout_ = compute_loop_layout_from_buffer(read_source_buffer);
  } else if (level == InferLevel::kFree) {
-    if (read_source_buffer.defined()) {
-      loop_layout_ = compute_loop_layout_from_buffer(read_source_buffer);
-      // Loop don't need to be replicated.
-      if (!is_one(loop_layout_->ReplicateExtent()))
-        loop_layout_ = loop_layout_->DeReplicate();
-      // if still has replication, add a condition
-      if (!is_one(loop_layout_->ReplicateExtent())) {
-        auto inv = loop_layout_->Inverse();
-        Array<PrimExpr> fwd;
-        for (size_t i = 0; i < loop_layout_->OutputDim(); i++)
-          fwd.push_back(0);
-        fwd.push_back(InputPlaceholder(0));
-        auto rep = inv->Forward(fwd).back();
-        AddPredicate(EQ(rep, 0));
-      }
+    // // Loop don't need to be replicated.
+    // if (!is_one(loop_layout_->ReplicateExtent()))
+    //   loop_layout_ = loop_layout_->DeReplicate();
+    // // if still has replication, add a condition
+    // if (!is_one(loop_layout_->ReplicateExtent())) {
+    //   auto inv = loop_layout_->Inverse();
+    //   Array<PrimExpr> fwd;
+    //   for (size_t i = 0; i < loop_layout_->OutputDim(); i++)
+    //     fwd.push_back(0);
+    //   fwd.push_back(InputPlaceholder(0));
+    //   auto rep = inv->Forward(fwd).back();
+    //   AddPredicate(EQ(rep, 0));
+    // }
  } else {
    // Vectorize Size must be aware of the buffer_remap
    // As the pass will do post processing to the layout
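
Note (not part of the diff): a simplified, hypothetical model of what the now-commented replication guard did: compute each thread's replica index and predicate the loop body on replica 0, so replicated iterations execute only once. AddPredicate(EQ(rep, 0)) played the role of the guard below.

#include <cstdio>

int main() {
  const int num_threads = 8;
  const int replicate_extent = 4;  // each iteration mirrored across 4 threads
  const int threads_per_replica = num_threads / replicate_extent;
  for (int tid = 0; tid < num_threads; ++tid) {
    // Hypothetical replica index; the real mapping comes from the inverse
    // fragment layout (loop_layout_->Inverse() in the commented block).
    int rep = tid / threads_per_replica;
    if (rep == 0)  // counterpart of AddPredicate(EQ(rep, 0))
      std::printf("thread %d executes the replicated iteration\n", tid);
  }
  return 0;
}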
@@ -229,6 +227,7 @@ LayoutMap ParallelOp::InferLayout(const LayoutInferArgs &T, InferLevel level) {
  } else {
    return {};
  }
  // Step 2: Check that the loop's partition can correctly align with all source
  // fragment
  for (const auto &[buffer, _] : indice_map_) {