import tilelang
import tilelang.language as T
import tilelang.testing


@tilelang.testing.requires_cuda
def test_language_ldg_codegen():
    N = 128

    @T.prim_func
    def main(
        x: T.Tensor((N,), T.float32),
        y: T.Tensor((N,), T.float32),
    ):
        with T.Kernel(N, threads=32) as pid:
            # Explicitly request read-only cache load for x[pid]
            y[pid] = T.__ldg(x[pid]) + 1.0

    # Compile for CUDA and retrieve generated CUDA source
    kernel = tilelang.compile(main, out_idx=[1], target="cuda")
    src = kernel.get_kernel_source()
    print(src)
    # Assert that codegen uses __ldg on CUDA backend
    # We look for the intrinsic call with address-of argument
    assert "__ldg(" in src, "Expected __ldg call in generated CUDA source"
    assert "__ldg(&" in src or "__ldg(&(" in src, "Expected address-of form in __ldg call"


if __name__ == "__main__":
    tilelang.testing.main()