"git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "58a51b5bcf2e06a68c153d3631b14104b7a71130"
Commit 8c5b1341 authored by Lei Wang, committed by LeiWang1999
Browse files

[Bugfix] Support `T.Parallel` with local register assignment (#395)

* make it python 3.8- happy

* [Enhancement] Improve loop partitioning and vectorization logic in layout inference and loop vectorization

- Enhanced the VisitStmt_ method to support local buffer handling in parallel loops, allowing for register usage without explicit thread binding.
- Updated loop vectorization logic to simplify expressions and ensure accurate vector size calculations, improving performance and clarity in the vectorization process.

* lint fix
parent 192a3995
...@@ -529,16 +529,37 @@ private: ...@@ -529,16 +529,37 @@ private:
Stmt VisitStmt_(const ForNode *op) final { Stmt VisitStmt_(const ForNode *op) final {
For for_node = Downcast<For>(IRMutatorWithAnalyzer::VisitStmt_(op)); For for_node = Downcast<For>(IRMutatorWithAnalyzer::VisitStmt_(op));
if (result_.for_map.count(GetRef<For>(op))) { if (result_.for_map.count(GetRef<For>(op))) {
auto loop_layout = result_.for_map[GetRef<For>(op)]; auto root = GetRef<For>(op);
if (!skip_thread_partition_) { // This check is a workaround to support T.Parallel for local buffers.
// If none thread bindings are provided, partition the loop // For example:
// for i in T.Parallel(1024):
// A_local[i] = A_global[i]
// Here, A_local is a register-local buffer held independently by each
// thread, so explicit thread binding is not required.
//
// We use PostOrderVisit to detect whether the buffer store targets a
// "local" buffer, which indicates register usage and justifies skipping
// thread binding.
bool is_register_store = false;
PostOrderVisit(root, [&](const ObjectRef &obj) {
if (const auto *store = obj.as<BufferStoreNode>()) {
if (store->buffer.scope() == "local") {
is_register_store = true;
}
}
});
bool parallel_loop = !is_register_store && !skip_thread_partition_;
if (parallel_loop) {
auto loop_layout = result_.for_map[root];
for_node = for_node =
PartitionLoop(for_node, thread_var_->var, analyzer_, loop_layout); PartitionLoop(for_node, thread_var_->var, analyzer_, loop_layout);
} }
// If none thread bindings are provided, partition the loop
for_node = VectorizeLoop(for_node); for_node = VectorizeLoop(for_node);
if (result_.predicate_map.count(GetRef<For>(op))) { if (result_.predicate_map.count(root) && parallel_loop) {
return IfThenElse(result_.predicate_map[GetRef<For>(op)], for_node); return IfThenElse(result_.predicate_map[root], for_node);
} else { } else {
return for_node; return for_node;
} }
......
...@@ -53,8 +53,6 @@ public: ...@@ -53,8 +53,6 @@ public:
int Plan(const For &node) { int Plan(const For &node) {
this->operator()(node); this->operator()(node);
// Always Enable vectorization
// if (!has_nonlocal_memory_access_) return 1;
return vector_size_; return vector_size_;
} }
...@@ -127,14 +125,12 @@ private: ...@@ -127,14 +125,12 @@ private:
} }
// so we should disable this GCD optimization // so we should disable this GCD optimization
max_vector_size = arith::ZeroAwareGCD(max_vector_size, extent_ptr->value); max_vector_size = arith::ZeroAwareGCD(max_vector_size, extent_ptr->value);
auto last_dim = buffer->shape.back(); auto last_dim = buffer->shape.back();
auto mod_set = analyzer_.modular_set(last_dim); auto mod_set = analyzer_.modular_set(last_dim);
// when dynamic shape like [m, k]: coeff=1, base=0, GCD will block // when dynamic shape like [m, k]: coeff=1, base=0, GCD will block
// conditionally tail vectorize // conditionally tail vectorize
if (buffer->shape.back().as<IntImmNode>()) { if (buffer->shape.back().as<IntImmNode>()) {
max_vector_size = arith::ZeroAwareGCD(max_vector_size, mod_set->coeff); max_vector_size = arith::ZeroAwareGCD(max_vector_size, mod_set->coeff);
auto gcd_base = arith::ZeroAwareGCD(max_vector_size, mod_set->base); auto gcd_base = arith::ZeroAwareGCD(max_vector_size, mod_set->base);
// If gcd_base is equal to the last dimension, // If gcd_base is equal to the last dimension,
// we should analyze the second-to-last dimension // we should analyze the second-to-last dimension
...@@ -142,7 +138,6 @@ private: ...@@ -142,7 +138,6 @@ private:
if (gcd_base < Downcast<IntImm>(last_dim)->value) { if (gcd_base < Downcast<IntImm>(last_dim)->value) {
max_vector_size = gcd_base; max_vector_size = gcd_base;
} }
vector_size_ = arith::ZeroAwareGCD(max_vector_size, vector_size_); vector_size_ = arith::ZeroAwareGCD(max_vector_size, vector_size_);
PrimExpr elem_offset = 0; PrimExpr elem_offset = 0;
...@@ -243,12 +238,13 @@ bool IndiceCanVectorize(PrimExpr expr, Var var, PrimExpr iter_var_size, ...@@ -243,12 +238,13 @@ bool IndiceCanVectorize(PrimExpr expr, Var var, PrimExpr iter_var_size,
return false; return false;
Var v0("v0"), v1("v1"); Var v0("v0"), v1("v1");
analyzer->Bind(v0, Range(0, target_vectorized_size)); analyzer->Bind(v0, Range(0, target_vectorized_size));
analyzer->Bind(v1, Range(0, FloorDiv(iter_var_size, target_vectorized_size))); analyzer->Bind(v1, Range(0, analyzer->Simplify(FloorDiv(
iter_var_size, target_vectorized_size))));
PrimExpr expr_transformed = analyzer->Simplify( PrimExpr expr_transformed = analyzer->Simplify(
Substitute(expr, {{var, v0 + v1 * target_vectorized_size}})); Substitute(expr, {{var, v0 + v1 * target_vectorized_size}}));
Vectorizer vectorizer(v0, IntImm(v0->dtype, target_vectorized_size)); Vectorizer vectorizer(v0, IntImm(v0->dtype, target_vectorized_size));
PrimExpr expr_vectorized = vectorizer.VisitExpr(expr_transformed); PrimExpr expr_vectorized =
analyzer->Simplify(vectorizer.VisitExpr(expr_transformed));
auto ramp_node = expr_vectorized.as<RampNode>(); auto ramp_node = expr_vectorized.as<RampNode>();
if (!ramp_node) { if (!ramp_node) {
// Broadcast value // Broadcast value
......
...@@ -72,7 +72,10 @@ class KernelParam: ...@@ -72,7 +72,10 @@ class KernelParam:
Returns: Returns:
bool: True if parameter is an unsigned integer type, False otherwise bool: True if parameter is an unsigned integer type, False otherwise
""" """
return str(self.dtype).removeprefix("torch.").startswith("uint") dtype_str = str(self.dtype)
if dtype_str.startswith("torch."):
dtype_str = dtype_str[6:]
return dtype_str.startswith("uint")
def is_float8(self) -> bool:
    """
    Check if the parameter's dtype is a float8 variant.

    Avoids str.removeprefix so the code stays compatible with
    Python < 3.9: the "torch." qualifier, when present, is stripped
    manually before inspecting the dtype name.

    Returns:
        bool: True if parameter is a float8 type, False otherwise
    """
    dtype_str = str(self.dtype)
    prefix = "torch."
    if dtype_str.startswith(prefix):
        # Use len(prefix) rather than a magic slice index so the strip
        # length cannot drift out of sync with the prefix literal.
        dtype_str = dtype_str[len(prefix):]
    return dtype_str.startswith("float8")
def is_boolean(self) -> bool:
    """
    Check if the parameter's dtype is a boolean type.

    Avoids str.removeprefix for Python < 3.9 compatibility, mirroring
    the sibling is_uint / is_float8 helpers.

    Returns:
        bool: True if parameter is a boolean type, False otherwise
    """
    dtype_str = str(self.dtype)
    prefix = "torch."
    if dtype_str.startswith(prefix):
        dtype_str = dtype_str[len(prefix):]
    # Bug fix: the previous version returned the stripped string itself
    # (`dtype_str[6:]`, a truthy str for any "torch."-prefixed dtype)
    # instead of applying the startswith("bool") check, violating the
    # declared -> bool contract and misclassifying e.g. torch.float32.
    return dtype_str.startswith("bool")
@dataclass @dataclass
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment