"git@developer.sourcefind.cn:OpenDAS/opencompass.git" did not exist on "94eb90569f63800c237f32a35385dff93b43f13a"
Unverified Commit 6f67da84 authored by Lei Wang, committed by GitHub

[Enhancement] Introduce `T.__ldg` (#1414)

* [Enhancement] Add __ldg intrinsic for CUDA read-only cache loads

* Introduced the __ldg intrinsic to enable explicit read-only cached loads from global memory in CUDA.
* Updated the corresponding documentation and added support in both CUDA and HIP code generation.
* Enhanced the Python interface for __ldg to accept BufferLoad and Buffer types, improving usability.

* [Enhancement] Update formatting and linting rules in pyproject.toml; minor test adjustment

* Added new formatting rules in pyproject.toml to enforce consistent code style, including hanging indents and argument splitting.
* Updated test_tilelang_language_intrinsics_codegen.py to improve readability by adding a blank line before the main execution block.
* Refactored error messages in builtin.py for better clarity and consistency, ensuring proper formatting in function definitions and in raised ValueErrors.

* lint fix
parent 34632a1b
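For orientation before the diffs, here is a minimal sketch of the two call forms the commit message describes. The kernel name `copy_plus_one` is illustrative, not part of this commit; the API (`T.__ldg`, `T.Kernel`, `T.Tensor`) is as introduced below.

```python
import tilelang
import tilelang.language as T

N = 128

@T.prim_func
def copy_plus_one(
        x: T.Tensor((N,), "float32"),
        y: T.Tensor((N,), "float32"),
):
    with T.Kernel(N, threads=32) as pid:
        # Preferred form: pass a BufferLoad; lowers to __ldg(&x[pid]) on CUDA.
        y[pid] = T.__ldg(x[pid]) + 1.0
        # Equivalent fallback form: pass the Buffer plus an index.
        # y[pid] = T.__ldg(x, pid) + 1.0
```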
@@ -368,5 +368,10 @@ TIR_DEFINE_TL_BUILTIN(warp_reduce_bitor)
    .set_attr<TCallEffectKind>("TCallEffectKind",
                               Integer(CallEffectKind::kOpaque));
// __ldg(BufferLoad | Buffer, idx?) -> value
// Treat as a pure call that returns the loaded value.
TIR_DEFINE_TL_BUILTIN(__ldg).set_num_inputs(-1).set_attr<TCallEffectKind>(
    "TCallEffectKind", Integer(CallEffectKind::kPure));
} // namespace tl
} // namespace tvm
@@ -600,6 +600,24 @@ TVM_DLL const Op &warp_reduce_bitand();
 */
TVM_DLL const Op &warp_reduce_bitor();
/*!
* \brief tilelang intrinsic for CUDA read-only cache load (__ldg).
*
* This op allows users to explicitly request a non-coherent cached load
* from global memory on CUDA by emitting `__ldg(&ptr[idx])` for 32-bit
* element types on supported architectures. It provides a direct way to
* leverage the read-only data cache for performance-sensitive loads when
* the compiler cannot infer `const __restrict__` automatically.
*
* Usage from TVMScript:
* y[i] = T.__ldg(x[i])
*
 * The op takes one argument, preferably a BufferLoad identifying the
 * source element; alternatively, backends may support passing a Buffer
 * together with an index expression.
*/
TVM_DLL const Op &__ldg();
} // namespace tl
} // namespace tvm
...
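The Python wrapper later in this diff builds exactly this call. As a standalone sketch of what reaches the codegen, assuming importing tilelang loads the shared library that registers `tl.__ldg`:

```python
import tilelang  # assumption: import loads the lib that registers tl.__ldg
from tvm import tir

# Build the call the T.__ldg wrapper produces: a pure intrinsic call
# whose sole argument is a BufferLoad identifying the source element.
buf = tir.decl_buffer((128,), "float32", name="x")
load = tir.BufferLoad(buf, [0])
call = tir.call_intrin("float32", tir.op.Op.get("tl.__ldg"), load)
print(call)  # e.g. a Call node printed as tl.__ldg(x[0])
```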
@@ -2354,6 +2354,23 @@ void CodeGenTileLangCUDA::VisitExpr_(const CallNode *op, std::ostream &os) {
    stream << ": \"l\"((void*)(" << global_buffer << "+" << global_addr
           << ")), \"r\"((int)" << guard << ")\n";
    stream << ");\n";
  } else if (op->op.same_as(tl::__ldg())) {
    // Explicit read-only cached load. Preferred form: __ldg(BufferLoad(...)).
    // Fallback form: __ldg(buffer, index)
    const BufferLoadNode *bl = nullptr;
    if (!op->args.empty()) {
      bl = op->args[0].as<BufferLoadNode>();
    }
    if (bl == nullptr) {
      LOG(FATAL) << "T.__ldg expects a BufferLoad as the first argument.";
    }
    const BufferNode *buffer = bl->buffer.get();
    ICHECK_EQ(bl->indices.size(), 1)
        << "T.__ldg currently supports flattened 1D buffer accesses.";
    PrimExpr base = bl->indices[0];
    // Emit __ldg(&buffer_ref)
    auto buffer_ref = this->GetBufferRef(op->dtype, buffer, base);
    os << "__ldg(&(" << buffer_ref << "))";
  } else if (op->op.same_as(builtin::reinterpret())) {
    DataType tgt_dtype = op->dtype;
    DataType src_dtype = op->args[0]->dtype;
...
@@ -828,6 +828,16 @@ void CodeGenTileLangHIP::VisitExpr_(const CallNode *op, std::ostream &os) {
  } else if (op->op.same_as(tl::pack_b16())) {
    os << "__pack_half2(" << this->PrintExpr(op->args[0]) << ", "
       << this->PrintExpr(op->args[1]) << ")";
  } else if (op->op.same_as(tl::__ldg())) {
    // HIP fallback: regular load
    const BufferLoadNode *bl = op->args[0].as<BufferLoadNode>();
    ICHECK(bl) << "T.__ldg expects a BufferLoad as the first argument.";
    ICHECK_EQ(bl->indices.size(), 1)
        << "T.__ldg currently supports flattened 1D buffer accesses.";
    const BufferNode *buffer = bl->buffer.get();
    PrimExpr base = bl->indices[0];
    auto buffer_ref = this->GetBufferRef(op->dtype, buffer, base);
    os << buffer_ref;
  } else if (op->op.same_as(builtin::tvm_fill_fragment())) {
    need_mma_h_ = true;
    ICHECK_EQ(op->args.size(), 6U);
...
import tilelang
import tilelang.language as T
import tilelang.testing


@tilelang.testing.requires_cuda
def test_language_ldg_codegen():
    N = 128

    @T.prim_func
    def main(
            x: T.Tensor((N,), "float32"),
            y: T.Tensor((N,), "float32"),
    ):
        with T.Kernel(N, threads=32) as pid:
            # Explicitly request read-only cache load for x[pid]
            y[pid] = T.__ldg(x[pid]) + 1.0

    # Compile for CUDA and retrieve generated CUDA source
    kernel = tilelang.compile(main, out_idx=[1], target="cuda")
    src = kernel.get_kernel_source()
    print(src)

    # Assert that codegen uses __ldg on CUDA backend;
    # we look for the intrinsic call with an address-of argument
    assert "__ldg(" in src, "Expected __ldg call in generated CUDA source"
    assert "__ldg(&" in src or "__ldg(&(" in src, "Expected address-of form in __ldg call"


if __name__ == "__main__":
    tilelang.testing.main()
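For the HIP path, which falls back to a plain load in the codegen diff above, a complementary check might look like the untested sketch below. It assumes a ROCm build, that `"hip"` is an accepted compile target string, and that `tilelang.testing` re-exports a `requires_rocm` decorator (all assumptions, not confirmed by this commit).

```python
@tilelang.testing.requires_rocm  # assumption: decorator exists in tilelang.testing
def test_language_ldg_codegen_hip():
    N = 128

    @T.prim_func
    def main(
            x: T.Tensor((N,), "float32"),
            y: T.Tensor((N,), "float32"),
    ):
        with T.Kernel(N, threads=32) as pid:
            y[pid] = T.__ldg(x[pid]) + 1.0

    kernel = tilelang.compile(main, out_idx=[1], target="hip")
    src = kernel.get_kernel_source()
    # On HIP the op lowers to a plain load, so no __ldg should appear.
    assert "__ldg(" not in src, "HIP backend should fall back to a plain load"
```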
@@ -96,6 +96,7 @@ from .customize import (
)
from .logical import any_of, all_of  # noqa: F401
from .builtin import *  # noqa: F401
from .builtin import __ldg as __ldg  # noqa: F401
from .utils import index_to_coordinates  # noqa: F401
...
@@ -59,6 +59,35 @@ def create_list_of_mbarrier(*args: Any) -> Call:
    raise TypeError("create_list_of_mbarrier expects a list or one or more arguments.")
def __ldg(load_or_buf: BufferLoad | tir.Buffer, index: PrimExpr | int | None = None) -> PrimExpr:
    """Explicitly load via CUDA read-only data cache.

    Prefer calling with a BufferLoad: `T.__ldg(x[i])` emits `__ldg(&x[i])` on CUDA.
    On non-CUDA backends, falls back to a regular load.

    Args:
        load_or_buf: A `BufferLoad` like `x[i]`, or a `Buffer`.
        index: Optional index when passing a `Buffer` directly.

    Returns:
        PrimExpr: The loaded value.
    """
    if isinstance(load_or_buf, BufferLoad):
        dtype = load_or_buf.dtype
        return tir.call_intrin(str(dtype), tir.op.Op.get("tl.__ldg"), load_or_buf)
    if isinstance(load_or_buf, tir.Buffer):
        if index is None:
            raise ValueError("T.__ldg(Buffer, index) requires an index when passing a Buffer.")
        idx = index
        if isinstance(index, (list, tuple)):
            if len(index) != 1:
                raise ValueError("T.__ldg currently supports 1D flattened indices.")
            idx = index[0]
        bl = BufferLoad(load_or_buf, [idx])
        return tir.call_intrin(str(load_or_buf.dtype), tir.op.Op.get("tl.__ldg"), bl)
    raise TypeError("T.__ldg expects a BufferLoad or a Buffer.")
def get_mbarrier(*args):
    """Retrieve a memory barrier operation.
...
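To make the wrapper's branches concrete, the illustrative calls below show which inputs it accepts and which it rejects, per the code above. Assume they sit inside a `T.Kernel` body with `x: T.Tensor((128,), "float32")` in scope.

```python
v0 = T.__ldg(x[0])     # BufferLoad form: preferred
v1 = T.__ldg(x, 0)     # Buffer plus scalar index
v2 = T.__ldg(x, [0])   # single-element index list is unwrapped
# T.__ldg(x)           # raises ValueError: index required with a Buffer
# T.__ldg(x, [0, 1])   # raises ValueError: only 1D flattened indices
```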