"git@developer.sourcefind.cn:OpenDAS/opencompass.git" did not exist on "94eb90569f63800c237f32a35385dff93b43f13a"
Unverified Commit 6f67da84 authored by Lei Wang, committed by GitHub

[Enhancement] Introduce `T.__ldg` (#1414)

* [Enhancement] Add __ldg intrinsic for CUDA read-only cache loads

* Introduced the __ldg intrinsic to enable explicit read-only cached loads from global memory in CUDA.
* Updated the corresponding documentation and added support in both CUDA and HIP code generation.
* Enhanced the Python interface for __ldg to accept BufferLoad and Buffer types, improving usability.

* [Enhancement] Update formatting and linting rules in pyproject.toml; minor test adjustment

* Added new formatting rules in pyproject.toml to enforce consistent code style, including hanging indents and argument splitting.
* Updated test_tilelang_language_intrinsics_codegen.py to improve readability by adding a blank line before the main execution block.
* Refactored error messages in builtin.py for better clarity and consistency, ensuring proper formatting in function definitions and in raised ValueErrors.

* lint fix
parent 34632a1b
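For orientation before the diffs, here is a minimal sketch of the two call forms the commit message describes. The kernel name `copy_plus_one` is illustrative, not part of this commit; the API (`T.__ldg`, `T.Kernel`, `T.Tensor`) is as introduced below.

```python
import tilelang
import tilelang.language as T

N = 128

@T.prim_func
def copy_plus_one(
        x: T.Tensor((N,), "float32"),
        y: T.Tensor((N,), "float32"),
):
    with T.Kernel(N, threads=32) as pid:
        # Preferred form: pass a BufferLoad; lowers to __ldg(&x[pid]) on CUDA.
        y[pid] = T.__ldg(x[pid]) + 1.0
        # Equivalent fallback form: pass the Buffer plus an index.
        # y[pid] = T.__ldg(x, pid) + 1.0
```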
@@ -368,5 +368,10 @@ TIR_DEFINE_TL_BUILTIN(warp_reduce_bitor)
    .set_attr<TCallEffectKind>("TCallEffectKind",
                               Integer(CallEffectKind::kOpaque));
// __ldg(BufferLoad | Buffer, idx?) -> value
// Treat as a pure call that returns the loaded value.
TIR_DEFINE_TL_BUILTIN(__ldg).set_num_inputs(-1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kPure));
} // namespace tl
} // namespace tvm
@@ -600,6 +600,24 @@ TVM_DLL const Op &warp_reduce_bitand();
 */
TVM_DLL const Op &warp_reduce_bitor();
/*!
* \brief tilelang intrinsic for CUDA read-only cache load (__ldg).
*
* This op allows users to explicitly request a non-coherent cached load
* from global memory on CUDA by emitting `__ldg(&ptr[idx])` for 32-bit
* element types on supported architectures. It provides a direct way to
* leverage the read-only data cache for performance-sensitive loads when
* the compiler cannot infer `const __restrict__` automatically.
*
* Usage from TVMScript:
* y[i] = T.__ldg(x[i])
*
 * The op takes one argument, preferably a BufferLoad identifying the
 * source element; alternatively, backends may support passing a Buffer
 * together with an index expression.
*/
TVM_DLL const Op &__ldg();
} // namespace tl
} // namespace tvm
...
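The Python wrapper later in this diff builds exactly this call. As a standalone sketch of what reaches the codegen, assuming importing tilelang loads the shared library that registers `tl.__ldg`:

```python
import tilelang  # assumption: import loads the lib that registers tl.__ldg
from tvm import tir

# Build the call the T.__ldg wrapper produces: a pure intrinsic call
# whose sole argument is a BufferLoad identifying the source element.
buf = tir.decl_buffer((128,), "float32", name="x")
load = tir.BufferLoad(buf, [0])
call = tir.call_intrin("float32", tir.op.Op.get("tl.__ldg"), load)
print(call)  # e.g. a Call node printed as tl.__ldg(x[0])
```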
@@ -2354,6 +2354,23 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
    stream << ": \"l\"((void*)(" << global_buffer << "+" << global_addr
           << ")), \"r\"((int)" << guard << ")\n";
    stream << ");\n";
  } else if (op->op.same_as(tl::__ldg())) {
    // Explicit read-only cached load. Preferred form: __ldg(BufferLoad(...)).
    // Fallback form: __ldg(buffer, index)
    const BufferLoadNode *bl = nullptr;
    if (!op->args.empty()) {
      bl = op->args[0].as<BufferLoadNode>();
    }
    if (bl == nullptr) {
      LOG(FATAL) << "T.__ldg expects a BufferLoad as the first argument.";
    }
    const BufferNode *buffer = bl->buffer.get();
    ICHECK_EQ(bl->indices.size(), 1)
        << "T.__ldg currently supports flattened 1D buffer accesses.";
    PrimExpr base = bl->indices[0];
    // Emit __ldg(&buffer_ref)
    auto buffer_ref = this->GetBufferRef(op->dtype, buffer, base);
    os << "__ldg(&(" << buffer_ref << "))";
  } else if (op->op.same_as(builtin::reinterpret())) {
    DataType tgt_dtype = op->dtype;
    DataType src_dtype = op->args[0]->dtype;
...
@@ -828,6 +828,16 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) {
  } else if (op->op.same_as(tl::pack_b16())) {
    os << "__pack_half2(" << this->PrintExpr(op->args[0]) << ", "
       << this->PrintExpr(op->args[1]) << ")";
  } else if (op->op.same_as(tl::__ldg())) {
    // HIP fallback: regular load
    const BufferLoadNode *bl = op->args[0].as<BufferLoadNode>();
    ICHECK(bl) << "T.__ldg expects a BufferLoad as the first argument.";
    ICHECK_EQ(bl->indices.size(), 1)
        << "T.__ldg currently supports flattened 1D buffer accesses.";
    const BufferNode *buffer = bl->buffer.get();
    PrimExpr base = bl->indices[0];
    auto buffer_ref = this->GetBufferRef(op->dtype, buffer, base);
    os << buffer_ref;
  } else if (op->op.same_as(builtin::tvm_fill_fragment())) {
    need_mma_h_ = true;
    ICHECK_EQ(op->args.size(), 6U);
...
import tilelang
import tilelang.language as T
import tilelang.testing


@tilelang.testing.requires_cuda
def test_language_ldg_codegen():
    N = 128

    @T.prim_func
    def main(
            x: T.Tensor((N,), "float32"),
            y: T.Tensor((N,), "float32"),
    ):
        with T.Kernel(N, threads=32) as pid:
            # Explicitly request read-only cache load for x[pid]
            y[pid] = T.__ldg(x[pid]) + 1.0

    # Compile for CUDA and retrieve generated CUDA source
    kernel = tilelang.compile(main, out_idx=[1], target="cuda")
    src = kernel.get_kernel_source()
    print(src)

    # Assert that codegen uses __ldg on CUDA backend;
    # we look for the intrinsic call with an address-of argument
    assert "__ldg(" in src, "Expected __ldg call in generated CUDA source"
    assert "__ldg(&" in src or "__ldg(&(" in src, "Expected address-of form in __ldg call"


if __name__ == "__main__":
    tilelang.testing.main()
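For the HIP path, which falls back to a plain load in the codegen diff above, a complementary check might look like the untested sketch below. It assumes a ROCm build, that `"hip"` is an accepted compile target string, and that `tilelang.testing` re-exports a `requires_rocm` decorator (all assumptions, not confirmed by this commit).

```python
@tilelang.testing.requires_rocm  # assumption: decorator exists in tilelang.testing
def test_language_ldg_codegen_hip():
    N = 128

    @T.prim_func
    def main(
            x: T.Tensor((N,), "float32"),
            y: T.Tensor((N,), "float32"),
    ):
        with T.Kernel(N, threads=32) as pid:
            y[pid] = T.__ldg(x[pid]) + 1.0

    kernel = tilelang.compile(main, out_idx=[1], target="hip")
    src = kernel.get_kernel_source()
    # On HIP the op lowers to a plain load, so no __ldg should appear.
    assert "__ldg(" not in src, "HIP backend should fall back to a plain load"
```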
@@ -96,6 +96,7 @@ from .customize import (
)
from .logical import any_of, all_of  # noqa: F401
from .builtin import *  # noqa: F401
from .builtin import __ldg as __ldg  # noqa: F401
from .utils import index_to_coordinates  # noqa: F401
...
@@ -59,6 +59,35 @@ def create_list_of_mbarrier(*args: Any) -> Call:
    raise TypeError("create_list_of_mbarrier expects a list or one or more arguments.")
def __ldg(load_or_buf: BufferLoad | tir.Buffer, index: PrimExpr | int | None = None) -> PrimExpr:
    """Explicitly load via CUDA read-only data cache.

    Prefer calling with a BufferLoad: `T.__ldg(x[i])` emits `__ldg(&x[i])` on CUDA.
    On non-CUDA backends, falls back to a regular load.

    Args:
        load_or_buf: A `BufferLoad` like `x[i]`, or a `Buffer`.
        index: Optional index when passing a `Buffer` directly.

    Returns:
        PrimExpr: The loaded value.
    """
    if isinstance(load_or_buf, BufferLoad):
        dtype = load_or_buf.dtype
        return tir.call_intrin(str(dtype), tir.op.Op.get("tl.__ldg"), load_or_buf)
    if isinstance(load_or_buf, tir.Buffer):
        if index is None:
            raise ValueError("T.__ldg(Buffer, index) requires an index when passing a Buffer.")
        idx = index
        if isinstance(index, (list, tuple)):
            if len(index) != 1:
                raise ValueError("T.__ldg currently supports 1D flattened indices.")
            idx = index[0]
        bl = BufferLoad(load_or_buf, [idx])
        return tir.call_intrin(str(load_or_buf.dtype), tir.op.Op.get("tl.__ldg"), bl)
    raise TypeError("T.__ldg expects a BufferLoad or a Buffer.")
def get_mbarrier(*args):
    """Retrieve a memory barrier operation.
...
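To make the wrapper's branches concrete, the illustrative calls below show which inputs it accepts and which it rejects, per the code above. Assume they sit inside a `T.Kernel` body with `x: T.Tensor((128,), "float32")` in scope.

```python
v0 = T.__ldg(x[0])     # BufferLoad form: preferred
v1 = T.__ldg(x, 0)     # Buffer plus scalar index
v2 = T.__ldg(x, [0])   # single-element index list is unwrapped
# T.__ldg(x)           # raises ValueError: index required with a Buffer
# T.__ldg(x, [0, 1])   # raises ValueError: only 1D flattened indices
```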