Commit 5501b31c authored by Wenhao Xie, committed by LeiWang1999

[Doc] Python API docs generation (#278)

* fix bug

* update performance.py

* update python api docs

* test workflow

* fix dependency

* fix bug

* fix

* update correct git config

* test workflow

* clear cache

* lint fix

* fix exclude path
parent 83412458
@@ -18,6 +18,7 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v2
with:
ref: refs/pull/${{ github.event.issue.number }}/merge
fetch-depth: 0
- name: Set up Python
......
@@ -12,18 +12,20 @@ permissions:
jobs:
docs:
if: ${{ github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main' }} || ${{ github.event_name == 'workflow_dispatch' }}
runs-on: ubuntu-latest
runs-on: [self-hosted]
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Build docs
run: |
chmod +x ./maint/scripts/build_docs.sh
./maint/scripts/build_docs.sh
- name: Configure git
run: |
git config --global user.email "tilelang@outlook.com"
git config --global user.name "GitHub Actions"
git config --local user.name "github-actions[bot]"
git config --local user.email "github-actions[bot]@users.noreply.github.com"
- name: Push to another repo
env:
TARGET_REPO: ${{ secrets.TARGET_REPO }}
......
_build/
api/
\ No newline at end of file
@@ -17,4 +17,6 @@ help:
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
rm -rf api/
sphinx-apidoc --separate -H "Python API" -o ./api/ ../tilelang "../tilelang/language/ast*" "../tilelang/language/parser*" "../tilelang/libinfo*" "../tilelang/version*"
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
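For readers who prefer to drive the API-stub generation from Python rather than the Makefile target above, the same call can be expressed through Sphinx's programmatic entry point. A minimal sketch (editor's illustration, assuming sphinx.ext.apidoc exposes its CLI as main(); not part of this commit):

from sphinx.ext.apidoc import main as apidoc_main

apidoc_main([
    "--separate",          # one page per module
    "-H", "Python API",    # header used in the generated index
    "-o", "./api/",        # output directory, recreated on every build
    "../tilelang",         # package to document
    # exclusion patterns mirror the Makefile invocation above
    "../tilelang/language/ast*",
    "../tilelang/language/parser*",
    "../tilelang/libinfo*",
    "../tilelang/version*",
])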
@@ -8,7 +8,8 @@ import sys
sys.path.insert(0, os.path.abspath("../tilelang"))
sys.path.insert(0, os.path.abspath("../"))
autodoc_mock_imports = ["torch"]
autodoc_mock_imports = ["torch", "tilelang.language.ast", "tilelang.language.parser"]
# General information about the project.
project = "Tile Language <br>"
@@ -46,11 +47,11 @@ myst_enable_extensions = [
redirects = {"get_started/try_out": "../index.html#getting-started"}
source_suffix = [".md"]
source_suffix = [".md", ".rst"]
language = "en"
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "README.md"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "README.md", "**/*libinfo*", "**/*version*"]
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
......
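Because the conf.py hunks above interleave old and new lines without +/- markers, this is the state of the touched settings after the commit, assembled from the added lines (a fragment, not the full conf.py):

autodoc_mock_imports = ["torch", "tilelang.language.ast", "tilelang.language.parser"]
source_suffix = [".md", ".rst"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "README.md", "**/*libinfo*", "**/*version*"]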
@@ -53,6 +53,12 @@ language_ref/primitives
language_ref/tilelibrary
:::
:::{toctree}
:maxdepth: 1
:caption: API Reference
api/modules
:::
:::{toctree}
:maxdepth: 1
......
cd docs
pip install -r requirements.txt
python -m venv .venv
source .venv/bin/activate
python -m pip install --upgrade pip
python -m pip install -r requirements-test.txt
python -m pip install -r docs/requirements.txt
python -m pip install -e .
cd docs
make html
cp CNAME _build/html/
@@ -2,6 +2,11 @@ import subprocess
import re
from tabulate import tabulate
import os
env = os.environ.copy()
env["TILELANG_CLEAR_CACHE"] = "1"
def parse_output(output):
data = {}
@@ -23,12 +28,14 @@ def parse_output(output):
output_v1 = subprocess.run(['./tl/bin/python', './maint/scripts/performance.py'],
capture_output=True,
text=True).stdout
text=True,
env=env).stdout
data_v1 = parse_output(output_v1)
output_v2 = subprocess.run(['./tll/bin/python', './maint/scripts/performance.py'],
capture_output=True,
text=True).stdout
text=True,
env=env).stdout
data_v2 = parse_output(output_v2)
table = [[
......
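The cache-clearing change above follows the standard subprocess pattern: copy the parent environment, set the flag, and pass it explicitly. A minimal standalone sketch (the TILELANG_CLEAR_CACHE name comes from the diff; the cache-rebuild semantics are inferred from the name and commit message, and the interpreter/script path is illustrative):

import os
import subprocess

env = os.environ.copy()
env["TILELANG_CLEAR_CACHE"] = "1"  # presumably makes tilelang rebuild kernels instead of reusing its cache

# Passing env= replaces the child's environment wholesale; copying os.environ
# first keeps PATH and friends intact while adding the flag.
result = subprocess.run(
    ["python", "./maint/scripts/performance.py"],  # illustrative interpreter and path
    capture_output=True,
    text=True,
    env=env,
)
print(result.stdout)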
import argparse
import itertools
import logging
import tilelang as tl
import tilelang.language as T
from tilelang.autotuner import autotune, jit
# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
from tilelang.autotuner import AutoTuner
def ref_program(A, B):
"""
A reference matrix multiplication program, used to compare performance.
Parameters
----------
A : numpy.ndarray
The matrix with shape (M, K).
B : numpy.ndarray
The matrix with shape (N, K).
Returns
-------
np.ndarray
The result of A @ B.T, shape (M, N).
"""
return A @ B.T
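# Editor's note (hedged usage sketch, not part of the committed file): B is
# stored as (N, K), so the reference result is A @ B.T with shape (M, N), e.g.
#     >>> import numpy as np
#     >>> A = np.ones((4, 8), dtype="float16")   # (M, K)
#     >>> B = np.ones((6, 8), dtype="float16")   # (N, K)
#     >>> ref_program(A, B).shape
#     (4, 6)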
def get_configs(M, N, K, with_roller=False):
"""
Generate a list of configuration dictionaries that will be used for tuning.
Parameters
----------
with_roller : bool
Whether to enable bitblas roller to deduce search spaces
Returns
-------
list of dict
Each configuration dict includes various block sizes, pipeline stages,
thread numbers, and other parameters to explore during autotuning.
"""
if with_roller:
from tilelang.carver.template import MatmulTemplate
from tilelang.carver.arch import CUDA
from tilelang.carver.roller.rasterization import NoRasterization
arch = CUDA("cuda")
topk = 10
carve_template = MatmulTemplate(
M=M,
N=N,
K=K,
in_dtype="float16",
out_dtype="float16",
accum_dtype="float",
).with_arch(arch)
func = carve_template.equivalent_function()
assert func is not None, "Function is None"
roller_hints = carve_template.recommend_hints(topk=topk)
if roller_hints is None:
raise ValueError("No Roller Hints Found for TensorCore Scheduling")
configs = []
for hint in roller_hints:
config = {}
block_m, block_n = hint.block
warp_m, warp_n = hint.warp
            # block_rows, block_cols represent the warp partitioning
block_rows, block_cols = block_m // warp_m, block_n // warp_n
config["block_M"] = block_m
config["block_N"] = block_n
config["block_K"] = hint.rstep[0]
config["num_stages"] = hint.pipeline_stage
config["thread_num"] = block_rows * block_cols * 32
config["policy"] = T.GemmWarpPolicy.from_warp_partition(block_rows, block_cols)
config["enable_rasteration"] = hint.rasterization_plan is not NoRasterization
configs.append(config)
for config in configs:
print(config)
else:
block_M = [128]
block_N = [128]
block_K = [64]
num_stages = [2]
thread_num = [128]
policy = [T.GemmWarpPolicy.Square]
enable_rasterization = [True]
_configs = list(
itertools.product(
block_M,
block_N,
block_K,
num_stages,
thread_num,
policy,
enable_rasterization,
))
configs = [
{
"block_M": c[0],
"block_N": c[1],
"block_K": c[2],
"num_stages": c[3],
"thread_num": c[4],
"policy": c[5],
"enable_rasteration": c[6], # keep param name for backward-compat
} for c in _configs
]
def get_configs():
configs = [{
"block_M": 128,
"block_N": 128,
"block_K": 64,
"num_stages": 2,
"thread_num": 256,
"enable_rasteration": True, # keep param name for backward-compat
}]
return configs
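# Editor's note (hedged illustration, not part of the committed file): the
# autotuner expands each dict returned here into keyword arguments of the
# tuned kernel, roughly like a manual sweep
#     for cfg in get_configs():
#         candidate = kernel(**cfg)   # hypothetical direct call
# and keeps whichever configuration measures fastest.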
def matmul(M, N, K, with_roller):
"""
Create an autotuned matrix multiplication kernel for matrices of shape:
- A: (M, K)
- B: (N, K)
- C: (M, N)
def run(M, N, K):
Parameters
----------
M : int
The dimension M of the matrix multiplication.
N : int
The dimension N of the matrix multiplication.
K : int
The dimension K of the matrix multiplication.
Returns
-------
(best_latency, best_config, ref_latency)
best_latency : float
The best latency found among the tuned configurations.
best_config : dict
The parameter configuration that yielded best_latency.
ref_latency : float
The baseline latency of the reference program (for computing speedup).
"""
# Decorate the kernel with autotune & jit, specifying:
# - Tuning config list
# - Profiling keys
# - Warmup and repetition counts for better measurement
# - A reference program for correctness verification
# - The "tvm" profiler backend
    # - "auto" as the compilation target (resolved to the local hardware's backend)
@autotune(
configs=get_configs(M, N, K, with_roller),
keys=[
"block_M",
"block_N",
"block_K",
"num_stages",
"thread_num",
"policy",
"enable_rasteration",
],
warmup=3,
rep=20,
)
@jit(
out_idx=[2],
supply_type=tl.TensorSupplyType.Integer,
ref_prog=ref_program,
skip_check=True,
target="auto",
)
def kernel(
block_M=None,
block_N=None,
block_K=None,
num_stages=None,
thread_num=None,
policy=None,
enable_rasteration=None,
):
"""
The actual kernel to compute C = A @ B^T.
Parameters
----------
block_M : int
Block size in M dimension.
block_N : int
Block size in N dimension.
block_K : int
Block size in K dimension.
num_stages : int
Number of pipelined stages (for asynchronous load).
thread_num : int
Number of threads to use per block.
enable_rasteration : bool
Whether to enable rasterization (swizzling) optimization.
k_pack : int
K dimension packing factor to improve memory coalescing.
Returns
-------
Function
A TVM Tensor Language function (T.prim_func) that computes matmul.
"""
# Use half-precision for input data to reduce memory bandwidth,
# accumulate in float for better numerical accuracy
dtype = "float16"
accum_dtype = "float"
@@ -221,58 +39,37 @@ def matmul(M, N, K, with_roller):
B: T.Tensor((N, K), dtype),
C: T.Tensor((M, N), dtype),
):
"""
The compiled TVM function for block-level matrix multiplication.
- We divide the entire (M, N) domain into blocks of shape
(block_M, block_N).
- Each block has its own allocated shared memory for sub-blocks
of A and B.
- The partial results go into C_local, and then we copy them back
to global memory C.
"""
# Bind x-dimension to block index in N,
# y-dimension to block index in M.
with T.Kernel(
T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
# Allocate shared memory for A sub-block of shape (block_M, block_K)
A_shared = T.alloc_shared((block_M, block_K), dtype)
# Allocate shared memory for B sub-block of shape (block_N, block_K)
B_shared = T.alloc_shared((block_N, block_K), dtype)
# Allocate a local fragment for intermediate accumulation
C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
# Allocate a shared memory for C sub-block of shape (block_M, block_N)
C_shared = T.alloc_shared((block_M, block_N), dtype)
# Enable (or disable) swizzling optimization
T.use_swizzle(panel_size=10, enable=enable_rasteration)
# Clear out the accumulation buffer
T.clear(C_local)
# Loop over sub-blocks in K dimension, pipelined by num_stages
for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
# Load a sub-block of A from global memory into A_shared
T.copy(A[by * block_M, k * block_K], A_shared)
# Load a sub-block of B from global memory into B_shared
T.copy(B[bx * block_N, k * block_K], B_shared)
# Perform a partial matrix multiplication:
# C_local += A_shared @ B_shared^T
T.gemm(
A_shared,
B_shared,
C_local,
transpose_B=True,
policy=policy,
)
# Write back the results from C_local to the global memory C
T.copy(C_local, C_shared)
T.copy(C_shared, C[by * block_M, bx * block_N])
return main
return kernel()
autotuner = AutoTuner.from_kernel(
kernel=kernel, configs=get_configs()).set_compile_args(
out_idx=[-1],
supply_type=tl.TensorSupplyType.Integer,
ref_prog=ref_program,
skip_check=False,
target="auto",
)
return autotuner.run(warmup=3, rep=20)
if __name__ == "__main__":
@@ -281,25 +78,17 @@ if __name__ == "__main__":
parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
parser.add_argument(
"--with_roller",
action="store_true",
help="Whether to enable BitBLAS roller for search space",
)
args = parser.parse_args()
M, N, K = args.m, args.n, args.k
with_roller = args.with_roller
# Compute total floating-point operations to measure throughput
total_flops = 2 * M * N * K
# matmul(...) returns (best_latency, best_config, ref_latency)
best_latency, best_config, ref_latency = matmul(M, N, K, with_roller)
result = run(M, N, K)
# Print out the benchmark results
print(f"Latency: {best_latency}")
print(f"TFlops: {total_flops / best_latency * 1e-9:.3f}")
print(f"Config: {best_config}")
print(f"Latency: {result.latency}")
print(f"TFlops: {total_flops / result.latency * 1e-9:.3f}")
print(f"Config: {result.config}")
print(f"Reference TFlops: {total_flops / ref_latency * 1e-9:.3f}")
\ No newline at end of file
print(f"Reference TFlops: {total_flops / result.ref_latency * 1e-9:.3f}")