/* Reduce the displayed size of the sidebar logo in Furo */
.sidebar-logo {
  max-height: 125px;
  width: auto;
}

/* Optional: keep the container from growing too tall due to spacing */
.sidebar-logo-container {
  line-height: 0;
}
# InjectFenceProxy Pass
`tl.transform.InjectFenceProxy` is a TIR-level transform that keeps the GPU proxy state consistent on NVIDIA Hopper (SM90+) by inserting `fence.proxy.async` instructions when control flow switches from generic memory operations to asynchronous proxy operations.
## Why Fences Are Needed
Hopper separates memory instructions into generic and asynchronous proxy paths. When an asynchronous instruction (for example, `cp.async` or `tma.load`) issues after generic traffic (like `ldmatrix` or plain buffer stores), the hardware requires a `fence.proxy.async` to guarantee ordering. Missing fences can lead to race conditions or undefined behavior.
## What the Pass Does
- Walks every statement in the `PrimFunc`, tracking whether it behaves as a **generic**, **async**, or **neutral** proxy (neutral statements reset the state, such as an explicit fence).
- Automatically lowers `tma_store` intrinsics into the required `arrive`/`wait` handshake so that TMA stores participate correctly in synchronization.
- Injects an explicit `fence.proxy.async` whenever a generic statement is followed by an async statement without an intervening neutral barrier.
The pass is conservative: unknown extern calls are treated as async so that the fence is inserted rather than accidentally omitted.
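The transition rule above can be modeled as a small state machine. The following is a hypothetical Python sketch of the tracking logic, not the actual C++ implementation in `src/transform/inject_fence_proxy.cc`; `inject_fences` and the string kinds are illustrative names only:

```python
# Hypothetical model of the proxy tracker: walk statements in order and
# emit a "fence" marker at every generic -> async transition.
GENERIC, ASYNC, NEUTRAL = "generic", "async", "neutral"

def inject_fences(kinds):
    """Return the statement kinds with a fence inserted wherever a
    generic statement is immediately followed by an async one."""
    out = []
    state = NEUTRAL  # neutral statements (e.g. an explicit fence) reset the state
    for kind in kinds:
        if state == GENERIC and kind == ASYNC:
            out.append("fence")  # stands in for fence.proxy.async
        out.append(kind)
        state = kind
    return out
```

For example, `["generic", "async"]` gains a fence between the two statements, while `["generic", "neutral", "async"]` does not, matching the neutral-reset behavior described above.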
### Timeline View
```
initialize_wgmma_descriptor  →  shared-store   →  wgmma
       (generic proxy)          (generic proxy)   (async proxy)
                                       │               ↑
                                       └─ fence.proxy.async
                                          inserted here
```
The proxy tracker scans the sequence from left to right. The moment it detects a transition from generic to async (between the shared store and the `wgmma` above), it synthesizes a `fence.proxy.async` to reset the hardware proxy state before the async path runs.
## Coverage of Intrinsics
The tracker understands the TileLang intrinsics for TMA load/store, shared-memory MMA (`wgmma`), and TVM/PTX async copy intrinsics (`cp.async` variants). Generic operations currently include `ldmatrix`, `stmatrix`, and descriptor initialization. Other IR nodes (loops, blocks, attributes) receive a proxy kind derived from their bodies so that the analysis survives structured control flow.
## Usage
The pass is part of the default TileLang lowering pipeline. To apply it manually:
```python
import tvm
from tvm import IRModule
import tilelang as tl

mod = IRModule({"main": prim_func})
with tvm.transform.PassContext():
    mod = tl.transform.InjectFenceProxy()(mod)
```
## End-to-End Example
Before the pass:
```python
@T.prim_func
def kernel():
    with T.Kernel(1):
        desc = T.decl_buffer((1,), "uint64", scope="local.descriptor")
        smem = T.decl_buffer((128,), "float16", scope="shared")
        T.initialize_wgmma_descriptor(desc, T.uint64(0), 2, 1, 32)
        smem[0] = T.float16(0)
        T.ptx_wgmma_ss(
            "float16",
            "m64n64k16",
            T.bool(True),
            T.bool(True),
            "fp16",
            "fp16",
            "fp16",
            desc.data,
            T.int32(0),
            desc.data,
            T.int32(0),
            smem.data,
            T.int32(0),
            T.bool(True),
            1,
            1,
        )
```
After `tl.transform.InjectFenceProxy`:
```python
@T.prim_func
def kernel():
    with T.Kernel(1):
        desc = T.decl_buffer((1,), "uint64", scope="local.descriptor")
        smem = T.decl_buffer((128,), "float16", scope="shared")
        T.initialize_wgmma_descriptor(desc, T.uint64(0), 2, 1, 32)
        smem[0] = T.float16(0)
        T.fence_proxy_async()
        T.ptx_wgmma_ss(
            "float16",
            "m64n64k16",
            T.bool(True),
            T.bool(True),
            "fp16",
            "fp16",
            "fp16",
            desc.data,
            T.int32(0),
            desc.data,
            T.int32(0),
            smem.data,
            T.int32(0),
            T.bool(True),
            1,
            1,
        )
```
The only change is the `fence_proxy_async` between the generic descriptor setup / shared-memory write and the async `wgmma`. In larger kernels the pass performs the same operation across nested blocks, loops, and conditional branches.
## Extending the Pass
If you introduce a new intrinsic that behaves like an async proxy, add it to `IsAsyncIntrinsic` in `src/transform/inject_fence_proxy.cc`. Likewise, extend `IsKnownGeneric` for additional generic operations. When adding new neutral barriers, make sure they set the proxy kind to `kNeutral` so the state resets correctly.
# LetStmt Inlining in TileLang
This document explains how `LetStmt` inlining works in TileLang's simplification pipeline, which is an important optimization that affects code generation and performance.
## Overview
A `LetStmt` (Let Statement) is a temporary variable binding in the IR (Intermediate Representation). During compilation, TileLang's simplifier may choose to inline these temporary variables to simplify the code. TileLang also provides a standalone `LetInline` pass that performs eager substitution before the main legalization pipeline. However, not all `LetStmt` nodes can be safely inlined.
## When Does LetStmt Get Inlined?
The inlining logic is implemented in `src/transform/simplify.cc`. A `LetStmt` will be inlined if **both** of the following conditions are met:
### 1. The value satisfies `CanInlineLetStmt`
The `CanInlineLetStmt` helper returns `true` when:
- **The value is a constant** (`is_const_number(op->value)` returns true)
- **The value is a variable** (`op->value.as<VarNode>()` returns a node)
- **The value is an integer expression without side effects**:
- The value has `int` dtype
- The side effect level is `kPure` or lower (no observable side effects)
```cpp
bool CanInlineLetStmt(const LetStmtNode *op) {
  if (is_const_number(op->value))
    return true;
  if (op->value.as<VarNode>())
    return true;
  // Won't face the deep expression explosion problem as in Let expression.
  // Attempt to inline as much as possible if the value is an integer type
  // (it can be an index).
  if (!op->value.dtype().is_int())
    return false;
  return SideEffect(op->value) <= CallEffectKind::kPure;
}
```
### 2. The variable is NOT used in buffer definitions
Even if `CanInlineLetStmt` returns true, the variable will **not** be inlined if it's used in a buffer's definition (shape, strides, elem_offset, or data fields).
This protection exists because:
- Buffer definitions are not updated during the simplification pass
- If a variable used in a buffer definition is inlined, later references to that buffer would fail to find the variable definition
- This would cause compilation errors or incorrect behavior
The mutator checks this before dropping the binding:
```cpp
bool used_in_buffer_def = used_in_buffer_def_.count(op->var.get());
if (can_inline && !used_in_buffer_def) {
  return body;  // Inline: remove the LetStmt and return its body directly
}
```
## Example: Why Buffer Definition Variables Are Protected
Consider this code:
```python
let stride = M * 16
let buffer_a = Buffer(data, shape=[M, N], strides=[stride, 1])
buffer_a[i, j] = ...
```
- `stride` satisfies `CanInlineLetStmt` (it's an int expression with no side effects)
- However, `stride` is used in `buffer_a`'s `strides` field
- If we inline it, the buffer definition becomes `strides=[M*16, 1]`
- But the Buffer object's fields are not updated during simplification
- Later code accessing `buffer_a` would fail to find the `stride` variable
Therefore, `stride` is added to `used_in_buffer_def_` and will **not** be inlined.
## How Variables Are Collected
The `CollectVarsUsedInBufferDefinition` helper traverses all `BufferLoad` and `BufferStore` nodes and collects variables used in their buffer definitions:
```cpp
void VisitBuffer(const Buffer &buf) {
  // Collect variables that should remain defined
  VarUseDefAnalyzer usage(Array<Var>{});
  usage(buf->data);
  for (const auto &dim : buf->shape) {
    usage(dim);
  }
  for (const auto &dim : buf->strides) {
    usage(dim);
  }
  usage(buf->elem_offset);
  // Track for use in the LetStmtNode mutator
  for (const auto &var : usage.undefined_) {
    used_in_buffer_def_.insert(var.get());
  }
}
```
## Practical Example: Temporary Variable Issue
Consider this TileLang code:
```python
for i in T.Parallel(block_N):
    idx = bx * block_N + i
    tmp = T.max(A[idx], 1)
    B[idx] = tmp / 2
    A[idx] = tmp * 2
```
In this case:
- `tmp` is an integer-like temporary variable
- It satisfies `CanInlineLetStmt` (pure int expression)
- It's **not** used in any buffer definition
- Therefore, `tmp` **will be inlined**
This means the IR becomes:
```python
for i in T.Parallel(block_N):
    idx = bx * block_N + i
    B[idx] = T.max(A[idx], 1) / 2
    A[idx] = T.max(A[idx], 1) * 2
```
If this causes issues (e.g., `A[idx]` being read twice with different values due to the first write), it indicates a potential problem with the inlining heuristic or the code pattern.
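One concrete way the duplicated read can change behavior is aliasing. The sketch below is plain Python (not TileLang IR) and assumes, purely for illustration, that `B` aliases `A`:

```python
# Two plain-Python stand-ins for the IR before and after inlining.
def with_let(A, B, idx):
    tmp = max(A[idx], 1)          # A[idx] is read exactly once
    B[idx] = tmp / 2
    A[idx] = tmp * 2

def with_inline(A, B, idx):
    B[idx] = max(A[idx], 1) / 2   # first read (and first store, if B is A)
    A[idx] = max(A[idx], 1) * 2   # second read observes that store

a_let = [4.0]
with_let(a_let, a_let, 0)         # B aliases A -> final value 8.0
a_inl = [4.0]
with_inline(a_inl, a_inl, 0)      # re-read sees 2.0 -> final value 4.0
```

When no aliasing (or reordering) is possible, the two forms agree, which is why the heuristic is usually safe.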
## Controlling Let Inlining via Pass Config
TileLang exposes an explicit pass configuration key, `tilelang.PassConfigKey.TL_FORCE_LET_INLINE` (`"tl.force_let_inline"`), that allows users to force the eager `LetInline` pass to run before the legalization pipeline begins. When enabled, the pipeline invokes `tilelang.transform.LetInline()` at the start of `LowerAndLegalize` (see `tilelang/engine/phase.py`). This knob is useful when debugging LetStmt-related issues or when deterministic inlining behavior is desired across different environments.
```python
from tilelang import transform
from tilelang.engine.phase import LowerAndLegalize

with transform.PassContext(
    config={transform.PassConfigKey.TL_FORCE_LET_INLINE: True}
):
    lowered_mod = LowerAndLegalize(input_mod, target)
```
If the flag is left unset (the default), the eager pass is only applied when downstream transforms opt in (for example, by calling `_Simplify(..., inline_let=True)` inside Tile operators). The guard in `tilelang/engine/phase.py` ensures the eager pass is only triggered when the user explicitly requests it.
## Summary
The LetStmt inlining mechanism is a **conservative optimization** that:
1. Aggressively inlines simple, pure integer expressions to simplify the IR
2. Protects variables used in buffer definitions to avoid breaking buffer access
3. Helps reduce IR complexity and improve code generation
4. Can be forced through `TL_FORCE_LET_INLINE` when deterministic eager inlining is required
Understanding when inlining happens is crucial for:
- Debugging compilation issues
- Understanding generated code
- Writing efficient TileLang programs
- Identifying potential optimization opportunities or bugs
## Related Files
- `src/transform/simplify.cc`: Main Simplify implementation
- `src/transform/frontend_legalize.cc`: Standalone LetInline pass
- `tilelang/engine/phase.py`: Pipeline integration for eager LetInlining
- `testing/python/transform/test_tilelang_transform_let_inline.py`: Regression coverage for the pass
# Tensor Checks (Host-Side Auto-Validation)
This page explains the host-side checks that TileLang automatically inserts into the generated host stub for kernels. When you pass `torch.Tensor` or any DLPack-compatible object to a TileLang kernel, the host stub validates argument count, pointer kinds, dtype, shape, strides, device, and more — so you don’t need to handwrite Python checks. This keeps the ABI stable and significantly reduces Python overhead compared to doing equivalent checks in Python or via pybind.
## Why Host-Side Checks
- ABI stability: the entry is based on TVM FFI + DLPack, consistently accepting tensors and scalars.
- Lower overhead: shifting checks from Python into C reduces interpreter/property-access costs; the call overhead is lower than pybind-based approaches.
- Focused error reporting: assertions are raised close to the call site with precise “which field failed” messages.
## How To Inspect Host Source
You can inspect the auto-generated host source (with all checks and the final device-kernel call) for debugging:
```python
print(matmul_relu_kernel.get_host_source())
```
---
## What The Host Checks
### 1) Argument count and pointer kind
- `num_args` must match the number of formal parameters; otherwise the kernel returns `-1` with an error message.
- Each argument’s FFI type must be a pointer kind (for DLTensor/handle) or a valid scalar type; otherwise you’ll see errors like `Expect arg[i] to be pointer` or a scalar type error.
### 2) Tensor checks (per tensor, after nullability decision)
- Nullability
  - If the tensor is “statically reachable/used” by the function body, the handle must be non-NULL; otherwise: `xxx is expected to have non-NULL pointer`.
  - If an input tensor is not used by the function (statically unreachable), NULL is allowed; other field checks are executed only when `handle != NULL`.
- Rank (`ndim`)
  - Runtime `ndim` must equal the compile-time rank.
- Data type (`dtype`)
  - Match the triple `(code, bits, lanes)` with tolerance:
    - `float8_e4m3`: accept `e4m3`, `e4m3fn`, `e4m3fnuz`.
    - `float8_e5m2`: accept `e5m2`, `e5m2fnuz`.
    - `bool`: accept `int8/uint8` with `bits=8` (same lanes), `kDLBool(code=6, bits=1 or 8)`, and any `bitwidth=1` (lanes must match).
  - For packed-bit dtypes (e.g., `Int(1)`, `Int(4)`, `UInt(4)`), strict dtype checking is skipped.
- Shape
  - Each runtime dimension is bound to the compile-time shape (constants or symbols) and checked for consistency.
  - Linear equations among symbolic dims can be solved on the fly (when there’s only one unknown at a given check point), enabling cross-tensor constraints.
- Strides
  - If `buffer_type = AutoBroadcast`: allow `strides == NULL` and derive strides from `shape`. If explicit `strides` are present, bind to compile-time constraints and check for equality.
  - Otherwise: check per dimension; if `strides == NULL`, derive from `shape` and compare (e.g., contiguous: `strides[-1] == 1`, `strides[-2] == shape[-1]`).
- `byte_offset`
  - Must be 0 (non-zero raises an error) to keep addressing simple and aligned.
- Device info
  - Assert `device_type == target backend` (CUDA/ROCM/Metal/OneAPI/WebGPU/CPU, etc.). Error messages include a DLPack code legend.
  - When multiple tensors participate, assert that `device_id` matches across them.
- Data pointer
  - Must be non-NULL when the tensor is required to be non-null by the nullability rule.
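When `strides == NULL`, the derived strides follow the usual row-major (contiguous) rule. Below is a plain-Python sketch of that derivation (illustrative only; the real check is emitted as C in the host stub):

```python
def contiguous_strides(shape):
    """Row-major strides derived from a shape, mirroring what the host
    assumes for a tensor whose strides field is NULL."""
    strides = [1] * len(shape)
    # Walk dimensions right to left: each stride is the product of all
    # dimension extents to its right.
    for i in range(len(shape) - 2, -1, -1):
        strides[i] = strides[i + 1] * shape[i + 1]
    return strides
```

For an `(M, N)` tensor this yields `[N, 1]`, matching the contiguous-layout comparison quoted above (`strides[-1] == 1`, `strides[-2] == shape[-1]`).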
### 3) Scalar checks
- `T.int*` family: require integer; error: `Expect arg[i] to be int`.
- `T.bool`: require boolean; error: `Expect arg[i] to be boolean`.
---
## Shapes and Symbolic Equations: Linear Solving
When shapes are symbolic, the host binds and (when possible) solves linear relations at runtime (only one unknown per check point). Example:
```python
@T.prim_func
def main(
    A: T.Tensor((m,), dtype),
    B: T.Tensor((m + n,), dtype),
    C: T.Tensor((n * k,), dtype),
):
    ...
```
This enables enforcing cross-tensor relationships like `len(B) == m + n` and `len(C) == n * k` at runtime.
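A toy model of the one-unknown-at-a-time solving for this signature; `bind_symbols` is a hypothetical helper, while the actual checks are generated C code in the host stub:

```python
def bind_symbols(a_shape, b_shape, c_shape):
    """Bind m, n, k for A:(m,), B:(m+n,), C:(n*k,). Each equation is
    solved only once a single unknown remains at that check point."""
    m = a_shape[0]                   # m is bound directly from A
    n = b_shape[0] - m               # m + n == len(B): n is the sole unknown
    k, rem = divmod(c_shape[0], n)   # n * k == len(C): k is the sole unknown
    if rem != 0:
        raise ValueError("unsatisfied constraint: n * k == len(C)")
    return m, n, k
```

For example, runtime shapes `(3,)`, `(8,)`, and `(10,)` bind `m=3, n=5, k=2`.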
---
## Nullability Rules and Examples
Which tensors may be NULL?
- Rule: If an input tensor is not used by the function under static analysis (i.e., the access is statically unreachable), it is considered Nullable; otherwise it must be non-NULL.
- Examples:
1) Must be non-NULL (used)
```python
@T.prim_func
def main(A: T.Tensor((M, K), dtype)):
    A[0] = 1
```
Passing `None` raises: `main.A_handle is expected to have non-NULL pointer`.
2) Still must be non-NULL (constant-true branch)
```python
some_cond: bool = True

@T.prim_func
def main(A: T.Tensor((M, K), dtype)):
    if some_cond:
        A[0] = 1
```
3) Nullable (constant-false branch, statically unreachable)
```python
some_cond: bool = False

@T.prim_func
def main(A: T.Tensor((M, K), dtype)):
    if some_cond:
        A[0] = 1
```
4) Must be non-NULL (runtime condition)
```python
@T.prim_func
def main(A: T.Tensor((M, K), dtype), some_cond: T.bool):
    if some_cond:
        A[0] = 1
```
Since `some_cond` is only known at runtime, static analysis cannot prove `A` is unused; `A` is thus non-nullable.
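The four cases reduce to one rule: a parameter is nullable only when every access to it is guarded by a condition that folds to a constant `False`. A toy Python model of that rule (hypothetical; the real analysis runs on the TIR):

```python
def is_nullable(guards):
    """Each element is the guard under which the parameter is accessed:
    True (unconditional), False (constant-false branch), or the string
    "runtime" for a condition only known at runtime."""
    return all(g is False for g in guards)

# Case 1: unconditional access   -> is_nullable([True])      == False
# Case 3: constant-false branch  -> is_nullable([False])     == True
# Case 4: runtime condition      -> is_nullable(["runtime"]) == False
```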
---
## Device Type Codes (DLPack)
Supported and referenced device codes in error messages: `1=CPU, 2=CUDA, 7=Vulkan, 8=Metal, 10=ROCM, 14=OneAPI, 15=WebGPU`.
Kernels assert that `device_type` matches the target backend, and require `device_id` consistency across tensors.
---
## Common Error Examples (What you’ll see)
- Argument count mismatch (num_args)
  - Trigger: missing/extra argument
  - Error: `<kernel>: num_args should be N; expected: <num_args>, got: N`
- Pointer-typed argument expected
  - Trigger: scalar passed where a tensor is expected
  - Error: `<kernel>: Expect arg[i] to be pointer`
- Rank (ndim) mismatch
  - Trigger: runtime rank differs from compile-time rank
  - Error: `<kernel>.<name>.ndim is expected to equal R, but got mismatched ndim`
- Dtype mismatch
  - Trigger: dtype not equal to the compiled dtype and not within the tolerance set
  - Error: `<kernel>.<name>.dtype is expected to be <dtype>, but got incompatible dtype`
- Shape constraint violation
  - Trigger: a dimension doesn’t match a constant/symbol binding
  - Error: `Argument <kernel>.<name>.shape[i] has an unsatisfied constraint: ... == <expected>`
- Strides check failed (e.g., non-contiguous layout)
  - Trigger: transposed/sliced tensors that violate expected strides
  - Error: `Argument <kernel>.<name>.strides[j] has an unsatisfied constraint: ... == <expected>`
- Device type mismatch
  - Trigger: calling a CUDA kernel with CPU tensors, etc.
  - Error: `<kernel>.<name>.device_type mismatch [expected: <code> (<name>)] ...`
- Device id mismatch
  - Trigger: mixing tensors from different GPUs
  - Error: `Argument <kernel>.<name>.device_id has an unsatisfied constraint: ... == ...`
- NULL data pointer
  - Trigger: tensor required to be non-null has a NULL data pointer
  - Error: `<kernel>.<name> is expected to have non-NULL data pointer, but got NULL`
- Scalar type mismatch
  - Trigger: passing float to `T.int32`, or non-boolean to `T.bool`
  - Error: `<kernel>: Expect arg[i] to be int/boolean`
---
## Troubleshooting Tips
- Print the host source: `print(fn.get_host_source())` to see the exact assertion and expected vs. actual fields.
- Fix strides: call `.contiguous()` for non-contiguous tensors, or avoid generating transposed/sliced layouts that break assumptions.
- Align devices: ensure all participating tensors share the same `device_type` and `device_id`.
- Align dtype: use `.to(<dtype>)` or construct tensors with the correct dtype; pay attention to `float8` and `bool` tolerance.
- Dynamic shapes: ensure cross-tensor linear relations can be uniquely determined at the check point (only one unknown at a time).
---
## FAQ
- Can I disable the checks?
- Not recommended and usually not supported. Checks are done on the host to preserve ABI stability and fail early close to the device call.
- Is the overhead noticeable?
- The checks are lightweight (branches and field reads); the dominating cost remains the Python→C boundary. Overall, the host-side checks are cheaper than equivalent checks written in Python.
---
## Reference Example (Matmul + ReLU)
```python
@T.prim_func
def matmul_relu_kernel(
    A: T.Tensor((M, K), dtype),
    B: T.Tensor((K, N), dtype),
    C: T.Tensor((M, N), dtype),
):
    # Initialize the kernel context
    with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
        A_shared = T.alloc_shared((block_M, block_K), dtype)
        B_shared = T.alloc_shared((block_K, block_N), dtype)
        C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
        T.clear(C_local)
        for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0):
            T.copy(A[by * block_M, ko * block_K], A_shared)
            T.copy(B[ko * block_K, bx * block_N], B_shared)
            T.gemm(A_shared, B_shared, C_local)
        T.copy(C_local, C[by * block_M, bx * block_N])

# For debugging, print the host source
print(matmul_relu_kernel.get_host_source())
```
The host will insert all checks described above for this example.
---
## Quick Error Reference (Short List)
- Argument count
  - Trigger: missing/extra args; Error: `num_args should be N; expected: <num_args>, got: N`.
- Pointer kind
  - Trigger: scalar passed to tensor arg; Error: `Expect arg[i] to be pointer`.
- Rank (ndim)
  - Trigger: runtime rank != compile-time; Error: `ndim ... expected to equal R`.
- Dtype
  - Trigger: mismatch and not tolerated; Error: `dtype ... expected to be <dtype>`.
- Shape
  - Trigger: constant/symbol binding violated; Error: `shape[i] ... == <expected>`.
- Strides
  - Trigger: layout mismatch; Error: `strides[j] ... == <expected>`.
- Device type
  - Trigger: wrong backend device; Error: `device_type mismatch [expected: ...]`.
- Device id
  - Trigger: tensors on different GPUs; Error: `device_id ... == ...`.
- Data pointer
  - Trigger: required non-NULL but NULL; Error: `non-NULL data pointer`.
- Scalar types
  - Trigger: wrong scalar type; Error: `Expect arg[i] to be int/boolean`.
---
## Host Error Troubleshooting (Minimal Repros)
Below are minimal repro snippets for common host-side errors, assuming a CUDA-targeted kernel like `matmul_relu_kernel` with:
```python
# Convention:
# A: float16 [M, K]
# B: float16 [K, N]
# C: float16 [M, N]
# Target: CUDA (device_type=2)
fn = matmul_relu_kernel # your compiled function
M = N = K = 1024
```
Adjust dtype/device if your kernel differs.
### 0. Tip: print the host source
```python
print(fn.get_host_source())
```
### 1. num_args mismatch
```python
import torch
A = torch.empty((M, K), device='cuda', dtype=torch.float16)
B = torch.empty((K, N), device='cuda', dtype=torch.float16)
# Missing C
fn(A, B)
```
Expected: `<kernel>: num_args should be 3; expected: 3, got: 2`.
Fix: pass all arguments per the signature.
### 2. Expect pointer (tensor) but got scalar
```python
import torch
B = torch.empty((K, N), device='cuda', dtype=torch.float16)
C = torch.empty((M, N), device='cuda', dtype=torch.float16)
fn(1, B, C)
```
Expected: `<kernel>: Expect arg[0] to be pointer`.
Fix: pass a DLPack-compatible tensor (e.g., torch.Tensor).
### 3. ndim mismatch
```python
import torch
A = torch.empty((M, K, 1), device='cuda', dtype=torch.float16) # rank=3
B = torch.empty((K, N), device='cuda', dtype=torch.float16)
C = torch.empty((M, N), device='cuda', dtype=torch.float16)
fn(A, B, C)
```
Expected: `<kernel>.A_handle.ndim is expected to equal 2, but got mismatched ndim`.
Fix: ensure runtime rank equals compiled rank.
### 4. dtype mismatch
```python
import torch
A = torch.empty((M, K), device='cuda', dtype=torch.float32) # should be float16
B = torch.empty((K, N), device='cuda', dtype=torch.float16)
C = torch.empty((M, N), device='cuda', dtype=torch.float16)
fn(A, B, C)
```
Expected: `<kernel>.A_handle.dtype is expected to be float16, but got incompatible dtype`.
Fix: `A = A.to(torch.float16)` or create with the correct dtype.
### 5. Shape constant/symbol mismatch
```python
import torch
A = torch.empty((M, K + 1), device='cuda', dtype=torch.float16) # K mismatched
B = torch.empty((K, N), device='cuda', dtype=torch.float16)
C = torch.empty((M, N), device='cuda', dtype=torch.float16)
fn(A, B, C)
```
Expected: `Argument <kernel>.A_handle.shape[i] has an unsatisfied constraint: ... == <expected>`.
Fix: satisfy linear constraints and constants across tensors.
### 6. Strides check failure (non-contiguous)
```python
import torch
A = torch.empty((M, K), device='cuda', dtype=torch.float16)
A_nc = A.t() # transpose -> non-contiguous
B = torch.empty((K, N), device='cuda', dtype=torch.float16)
C = torch.empty((M, N), device='cuda', dtype=torch.float16)
fn(A_nc, B, C)
```
Expected: `Argument <kernel>.A_handle.strides[1] has an unsatisfied constraint: ... == 1`.
Fix: pass `A_nc.contiguous()` or align the layout expectation in the kernel.
### 7. device_type mismatch
```python
import torch
A = torch.empty((M, K), device='cpu', dtype=torch.float16)
B = torch.empty((K, N), device='cpu', dtype=torch.float16)
C = torch.empty((M, N), device='cpu', dtype=torch.float16)
fn(A, B, C) # CUDA-targeted kernel
```
Expected: `<kernel>.A_handle.device_type mismatch [expected: 2 (cuda)] ...`.
Fix: move tensors to the CUDA device.
### 8. device_id mismatch (multi-GPU)
```python
import torch
A = torch.empty((M, K), device='cuda:0', dtype=torch.float16)
B = torch.empty((K, N), device='cuda:1', dtype=torch.float16)
C = torch.empty((M, N), device='cuda:0', dtype=torch.float16)
fn(A, B, C)
```
Expected: `Argument <kernel>.B_handle.device_id has an unsatisfied constraint: ... == ...`.
Fix: place all tensors on the same GPU (e.g., `cuda:0`).
### 9. NULL data pointer (advanced)
This usually comes from hand-constructed DLTensor/NDArray, or external frameworks passing unallocated/freed storage. Regular `torch.Tensor` allocations rarely hit this.
Expected: `<kernel>.<name> is expected to have non-NULL data pointer, but got NULL`.
Fix: ensure valid underlying storage; in PyTorch scenarios, avoid constructing tensors from invalid external handles.
### 10. Scalar type mismatch (int / bool)
```python
import tilelang.language as T

@T.prim_func
def scalar_check(x: T.int32, flag: T.bool()):
    T.evaluate(0)

scalar_check(1.0, True)  # x is float -> Expect arg[0] to be int
scalar_check(1, 2.5)     # flag is float -> Expect arg[1] to be boolean
```
Fix: pass correct scalar types, e.g., `scalar_check(1, True)`.
---
## Closing Notes
- Cross-check “shape / strides / device / dtype” against the kernel signature to localize issues efficiently.
- For complex symbolic relations, print the host source to confirm binding/solving order, then adjust runtime shapes/layouts accordingly.
# General information about the project.
project = "TileLang <br>"
author = "Tile Lang Contributors"
copyright = f"2025-2025, {author}"
# Version information.
with open("../VERSION") as f:
    version = f.read().strip()
release = version
extensions = [
    "sphinx_tabs.tabs",
    "sphinx_toolbox.collapse",
    "sphinxcontrib.httpdomain",
    "sphinx.ext.napoleon",
    "sphinx.ext.intersphinx",
    "sphinx_reredirects",
    "sphinx.ext.mathjax",
    "myst_parser",
    "autoapi.extension",
]
autoapi_type = "python"
autoapi_dirs = ["../tilelang"]
autoapi_options = [
    "members",
    "undoc-members",
    "show-inheritance",
    "show-module-summary",
    "special-members",
]
autoapi_keep_files = False  # Set to True to keep the generated rst files for debugging
autoapi_generate_api_docs = True
autodoc_typehints = "description"
autoapi_ignore = ["*language/ast*", "*version*", "*libinfo*", "*parser*"]
source_suffix = {".rst": "restructuredtext", ".md": "markdown"}
myst_enable_extensions = ["colon_fence", "deflist"]
redirects = {"get_started/try_out": "../index.html#getting-started"}
language = "en"
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "README.md", "**/*libinfo*", "**/*version*"]
pygments_style = "sphinx"
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
html_theme = "furo"
templates_path = []
html_static_path = ["_static"]
html_css_files = ["custom.css"]
footer_copyright = "© 2025-2026 TileLang"
footer_note = " "
html_theme_options = {"light_logo": "img/logo-v2.png", "dark_logo": "img/logo-v2.png"}
header_links = [
    ("Home", "https://github.com/tile-ai/tilelang"),
    ("Github", "https://github.com/tile-ai/tilelang"),
]
html_context = {
    "footer_copyright": footer_copyright,
    "footer_note": footer_note,
    "header_links": header_links,
    "display_github": True,
    "github_user": "tile-ai",
    "github_repo": "tilelang",
    "github_version": "main/docs/",
    "theme_vcs_pageview_mode": "edit",
}