Unverified Commit fe6cdc9d authored by Zhengju Tang's avatar Zhengju Tang Committed by GitHub
Browse files

[BugFix] Do not modify strict layout in common or relax level of layout...


[BugFix] Do not modify strict layout in common or relax level of layout inference. More conditions on layout checking (#653)

* [BugFix] Do not modify strict layout in common or relax level of layout inference. More conditions on layout checking

* Lint

* test fix

* Update CI workflow to install dependencies without user site packages

- Modified the installation commands in the CI workflow to include the `--no-user` flag for both `requirements-dev.txt` and `requirements-test.txt`, ensuring that packages are installed in the virtual environment rather than the user site directory.

* Update CI workflow to install pip without user site packages

- Added the `--no-user` flag to the pip installation command in the CI workflow for both development and testing dependencies, ensuring that packages are installed within the virtual environment.

* Update requirements-test.txt

* Reduce CI problem size

* Refactor example_mla_decode.py for consistent formatting and remove unused imports in test_example_mla_decode.py

---------
Co-authored-by: default avatarLeiWang1999 <leiwang1999@outlook.com>
Co-authored-by: default avatarLei Wang <34334180+LeiWang1999@users.noreply.github.com>
parent 8361eb5c
...@@ -276,16 +276,14 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): ...@@ -276,16 +276,14 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial):
return out return out
def main(): def main(
parser = argparse.ArgumentParser() batch=1,
parser.add_argument('--batch', type=int, default=132, help='batch size') heads=128,
parser.add_argument('--heads', type=int, default=128, help='q heads number') kv_heads=1,
parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number') kv_ctx=8192,
parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length') dim=512,
parser.add_argument('--dim', type=int, default=512, help='head dim') pe_dim=64,
parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim') ):
args = parser.parse_args()
batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim
qk_flops = 2 * batch * heads * kv_ctx * (dim + pe_dim) qk_flops = 2 * batch * heads * kv_ctx * (dim + pe_dim)
pv_flops = 2 * batch * heads * kv_ctx * dim pv_flops = 2 * batch * heads * kv_ctx * dim
total_flops = qk_flops + pv_flops total_flops = qk_flops + pv_flops
...@@ -302,4 +300,13 @@ def main(): ...@@ -302,4 +300,13 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() parser = argparse.ArgumentParser()
parser.add_argument('--batch', type=int, default=132, help='batch size')
parser.add_argument('--heads', type=int, default=128, help='q heads number')
parser.add_argument('--kv_heads', type=int, default=1, help='kv heads number')
parser.add_argument('--kv_ctx', type=int, default=8192, help='kv context length')
parser.add_argument('--dim', type=int, default=512, help='head dim')
parser.add_argument('--pe_dim', type=int, default=64, help='pe head dim')
args = parser.parse_args()
batch, heads, kv_heads, kv_ctx, dim, pe_dim = args.batch, args.heads, args.kv_heads, args.kv_ctx, args.dim, args.pe_dim
main(batch, heads, kv_heads, kv_ctx, dim, pe_dim)
import tilelang.testing import tilelang.testing
import example_mla_decode import example_mla_decode
from unittest import mock
import sys
@tilelang.testing.requires_cuda @tilelang.testing.requires_cuda
@tilelang.testing.requires_cuda_compute_version_ge(9, 0) @tilelang.testing.requires_cuda_compute_version_ge(9, 0)
def test_example_mla_decode(): def test_example_mla_decode():
with mock.patch.object(sys, 'argv', ["example_mla_decode.py"]): example_mla_decode.main()
example_mla_decode.main()
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -302,9 +302,9 @@ def ref_program(Q, K, V, is_causal, groups=1): ...@@ -302,9 +302,9 @@ def ref_program(Q, K, V, is_causal, groups=1):
return output return output
def main(BATCH: int = 8, def main(BATCH: int = 1,
H: int = 32, H: int = 32,
N_CTX: int = 1024, N_CTX: int = 256,
D_HEAD_QK: int = 192, D_HEAD_QK: int = 192,
D_HEAD_V: int = 128, D_HEAD_V: int = 128,
groups: int = 16, groups: int = 16,
......
...@@ -170,10 +170,10 @@ def ref_program(Q, K, V, is_causal): ...@@ -170,10 +170,10 @@ def ref_program(Q, K, V, is_causal):
def main( def main(
batch: int = 8, batch: int = 1,
heads: int = 32, heads: int = 32,
seq_q: int = 4096, seq_q: int = 256,
seq_kv: int = 4096, seq_kv: int = 256,
dim: int = 128, dim: int = 128,
is_causal: bool = False, is_causal: bool = False,
tune: bool = False, tune: bool = False,
......
...@@ -29,4 +29,4 @@ attrs ...@@ -29,4 +29,4 @@ attrs
decorator decorator
flash-attn<=2.2.0 flash-attn<=2.2.0
scipy scipy
tornado tornado
\ No newline at end of file
...@@ -294,8 +294,12 @@ LayoutMap ParallelOp::InferLayout(const LayoutInferArgs &T, InferLevel level) { ...@@ -294,8 +294,12 @@ LayoutMap ParallelOp::InferLayout(const LayoutInferArgs &T, InferLevel level) {
T.thread_bounds)); T.thread_bounds));
} }
// Layout infer conflict for local.fragment can noy be handled here // Layout infer conflict for local.fragment can not be handled here
// because the source_buffer is not always available // because the source_buffer is not always available
// (zhengju) Do not modify a strict layout even if it conflicts with the
// dst layout. This will not influence the result because a strict
// layout usually has rep = 1. Since the real layout map is
// controlled by layout_inference.cc, we should add this check there.
if (buffer.scope() == "local.fragment" && source_buffer.defined() && if (buffer.scope() == "local.fragment" && source_buffer.defined() &&
source_buffer.scope() == "local.fragment") { source_buffer.scope() == "local.fragment") {
if (T.layout_map.count(buffer)) { if (T.layout_map.count(buffer)) {
......
...@@ -153,10 +153,17 @@ public: ...@@ -153,10 +153,17 @@ public:
} }
} }
// If already in map, ensure they are structurally equal // If already in map, ensure they are structurally equal
ICHECK(StructuralEqual()(layout, layout_map[buffer])) // (zhengju) We cannot modify the strict layout map when the current
<< "Get different layout for " << buffer // level is not strict. This check should only be done under certain
<< "\n current layout: " << layout->DebugOutput() // conditions, since the strict layout map is not updated by the
<< "\n previous layout: " << layout_map[buffer]->DebugOutput(); // code above when the current level is not strict.
if (level == InferLevel::kStrict ||
!strict_layout_map.count(buffer)) {
ICHECK(StructuralEqual()(layout, layout_map[buffer]))
<< "Get different layout for " << buffer
<< "\n current layout: " << layout->DebugOutput()
<< "\n previous layout: " << layout_map[buffer]->DebugOutput();
}
} else { } else {
// Otherwise, update map // Otherwise, update map
layout_map.Set(buffer, layout); layout_map.Set(buffer, layout);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment