"vscode:/vscode.git/clone" did not exist on "07af8343f68aa146bcceb1d297d78c64b936b1ac"
Commit 5501b31c authored by Wenhao Xie, committed by LeiWang1999
Browse files

[Doc] Python API docs generation (#278)

* fix bug

* update performance.py

* update python api docs

* test workflow

* fix dependency

* fix bug

* fix

* update correct git config

* test workflow

* clear cache

* lint fix

* fix exclude path
parent 83412458
...@@ -18,6 +18,7 @@ jobs: ...@@ -18,6 +18,7 @@ jobs:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@v2 uses: actions/checkout@v2
with: with:
ref: refs/pull/${{ github.event.issue.number }}/merge
fetch-depth: 0 fetch-depth: 0
- name: Set up Python - name: Set up Python
......
...@@ -12,18 +12,20 @@ permissions: ...@@ -12,18 +12,20 @@ permissions:
jobs: jobs:
docs: docs:
if: ${{ github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main' }} || ${{ github.event_name == 'workflow_dispatch' }} if: ${{ github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main' }} || ${{ github.event_name == 'workflow_dispatch' }}
runs-on: ubuntu-latest runs-on: [self-hosted]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: actions/setup-python@v5 - uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Build docs - name: Build docs
run: | run: |
chmod +x ./maint/scripts/build_docs.sh chmod +x ./maint/scripts/build_docs.sh
./maint/scripts/build_docs.sh ./maint/scripts/build_docs.sh
- name: Configure git - name: Configure git
run: | run: |
git config --global user.email "tilelang@outlook.com" git config --local user.name "github-actions[bot]"
git config --global user.name "GitHub Actions" git config --local user.email "github-actions[bot]@users.noreply.github.com"
- name: Push to another repo - name: Push to another repo
env: env:
TARGET_REPO: ${{ secrets.TARGET_REPO }} TARGET_REPO: ${{ secrets.TARGET_REPO }}
......
_build/ _build/
api/
\ No newline at end of file
...@@ -17,4 +17,6 @@ help: ...@@ -17,4 +17,6 @@ help:
# Catch-all target: route all unknown targets to Sphinx using the new # Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile %: Makefile
rm -rf api/
sphinx-apidoc --separate -H "Python API" -o ./api/ ../tilelang "../tilelang/language/ast*" "../tilelang/language/parser*" "../tilelang/libinfo*" "../tilelang/version*"
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
...@@ -8,7 +8,8 @@ import sys ...@@ -8,7 +8,8 @@ import sys
sys.path.insert(0, os.path.abspath("../tilelang")) sys.path.insert(0, os.path.abspath("../tilelang"))
sys.path.insert(0, os.path.abspath("../")) sys.path.insert(0, os.path.abspath("../"))
autodoc_mock_imports = ["torch"]
autodoc_mock_imports = ["torch", "tilelang.language.ast", "tilelang.language.parser"]
# General information about the project. # General information about the project.
project = "Tile Language <br>" project = "Tile Language <br>"
...@@ -46,11 +47,11 @@ myst_enable_extensions = [ ...@@ -46,11 +47,11 @@ myst_enable_extensions = [
redirects = {"get_started/try_out": "../index.html#getting-started"} redirects = {"get_started/try_out": "../index.html#getting-started"}
source_suffix = [".md"] source_suffix = [".md", ".rst"]
language = "en" language = "en"
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "README.md"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "README.md", "**/*libinfo*", "**/*version*"]
# The name of the Pygments (syntax highlighting) style to use. # The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx" pygments_style = "sphinx"
......
...@@ -53,6 +53,12 @@ language_ref/primitives ...@@ -53,6 +53,12 @@ language_ref/primitives
language_ref/tilelibrary language_ref/tilelibrary
::: :::
:::{toctree}
:maxdepth: 1
:caption: API Reference
api/modules
:::
:::{toctree} :::{toctree}
:maxdepth: 1 :maxdepth: 1
......
cd docs python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt python -m pip install --upgrade pip
python -m pip install -r requirements-test.txt
python -m pip install -r docs/requirements.txt
python -m pip install -e .
cd docs
make html make html
cp CNAME _build/html/ cp CNAME _build/html/
...@@ -2,6 +2,11 @@ import subprocess ...@@ -2,6 +2,11 @@ import subprocess
import re import re
from tabulate import tabulate from tabulate import tabulate
import os
env = os.environ.copy()
env["TILELANG_CLEAR_CACHE"] = "1"
def parse_output(output): def parse_output(output):
data = {} data = {}
...@@ -23,12 +28,14 @@ def parse_output(output): ...@@ -23,12 +28,14 @@ def parse_output(output):
output_v1 = subprocess.run(['./tl/bin/python', './maint/scripts/performance.py'], output_v1 = subprocess.run(['./tl/bin/python', './maint/scripts/performance.py'],
capture_output=True, capture_output=True,
text=True).stdout text=True,
env=env).stdout
data_v1 = parse_output(output_v1) data_v1 = parse_output(output_v1)
output_v2 = subprocess.run(['./tll/bin/python', './maint/scripts/performance.py'], output_v2 = subprocess.run(['./tll/bin/python', './maint/scripts/performance.py'],
capture_output=True, capture_output=True,
text=True).stdout text=True,
env=env).stdout
data_v2 = parse_output(output_v2) data_v2 = parse_output(output_v2)
table = [[ table = [[
......
import argparse import argparse
import itertools
import logging
import tilelang as tl import tilelang as tl
import tilelang.language as T import tilelang.language as T
from tilelang.autotuner import autotune, jit from tilelang.autotuner import AutoTuner
# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
def ref_program(A, B): def ref_program(A, B):
"""
A reference matrix multiplication program, used to compare performance.
Parameters
----------
A : numpy.ndarray
The matrix with shape (M, K).
B : numpy.ndarray
The matrix with shape (N, K).
Returns
-------
np.ndarray
The result of A @ B.T, shape (M, N).
"""
return A @ B.T return A @ B.T
def get_configs(M, N, K, with_roller=False): def get_configs():
""" configs = [{
Generate a list of configuration dictionaries that will be used for tuning. "block_M": 128,
"block_N": 128,
Parameters "block_K": 64,
---------- "num_stages": 2,
with_roller : bool "thread_num": 256,
Whether to enable bitblas roller to deduce search spaces "enable_rasteration": True, # keep param name for backward-compat
}]
Returns
-------
list of dict
Each configuration dict includes various block sizes, pipeline stages,
thread numbers, and other parameters to explore during autotuning.
"""
if with_roller:
from tilelang.carver.template import MatmulTemplate
from tilelang.carver.arch import CUDA
from tilelang.carver.roller.rasterization import NoRasterization
arch = CUDA("cuda")
topk = 10
carve_template = MatmulTemplate(
M=M,
N=N,
K=K,
in_dtype="float16",
out_dtype="float16",
accum_dtype="float",
).with_arch(arch)
func = carve_template.equivalent_function()
assert func is not None, "Function is None"
roller_hints = carve_template.recommend_hints(topk=topk)
if roller_hints is None:
raise ValueError("No Roller Hints Found for TensorCore Scheduling")
configs = []
for hint in roller_hints:
config = {}
block_m, block_n = hint.block
warp_m, warp_n = hint.warp
# block_rows, block_cols represents warp partitioning
block_rows, block_cols = block_m // warp_m, block_n // warp_n
config["block_M"] = block_m
config["block_N"] = block_n
config["block_K"] = hint.rstep[0]
config["num_stages"] = hint.pipeline_stage
config["thread_num"] = block_rows * block_cols * 32
config["policy"] = T.GemmWarpPolicy.from_warp_partition(block_rows, block_cols)
config["enable_rasteration"] = hint.rasterization_plan is not NoRasterization
configs.append(config)
for config in configs:
print(config)
else:
block_M = [128]
block_N = [128]
block_K = [64]
num_stages = [2]
thread_num = [128]
policy = [T.GemmWarpPolicy.Square]
enable_rasterization = [True]
_configs = list(
itertools.product(
block_M,
block_N,
block_K,
num_stages,
thread_num,
policy,
enable_rasterization,
))
configs = [
{
"block_M": c[0],
"block_N": c[1],
"block_K": c[2],
"num_stages": c[3],
"thread_num": c[4],
"policy": c[5],
"enable_rasteration": c[6], # keep param name for backward-compat
} for c in _configs
]
return configs return configs
def matmul(M, N, K, with_roller): def run(M, N, K):
"""
Create an autotuned matrix multiplication kernel for matrices of shape:
- A: (M, K)
- B: (N, K)
- C: (M, N)
Parameters
----------
M : int
The dimension M of the matrix multiplication.
N : int
The dimension N of the matrix multiplication.
K : int
The dimension K of the matrix multiplication.
Returns
-------
(best_latency, best_config, ref_latency)
best_latency : float
The best latency found among the tuned configurations.
best_config : dict
The parameter configuration that yielded best_latency.
ref_latency : float
The baseline latency of the reference program (for computing speedup).
"""
# Decorate the kernel with autotune & jit, specifying:
# - Tuning config list
# - Profiling keys
# - Warmup and repetition counts for better measurement
# - A reference program for correctness verification
# - The "tvm" profiler backend
# - HIP as the compilation target (modify as needed for your hardware)
@autotune(
configs=get_configs(M, N, K, with_roller),
keys=[
"block_M",
"block_N",
"block_K",
"num_stages",
"thread_num",
"policy",
"enable_rasteration",
],
warmup=3,
rep=20,
)
@jit(
out_idx=[2],
supply_type=tl.TensorSupplyType.Integer,
ref_prog=ref_program,
skip_check=True,
target="auto",
)
def kernel( def kernel(
block_M=None, block_M=None,
block_N=None, block_N=None,
block_K=None, block_K=None,
num_stages=None, num_stages=None,
thread_num=None, thread_num=None,
policy=None,
enable_rasteration=None, enable_rasteration=None,
): ):
"""
The actual kernel to compute C = A @ B^T.
Parameters
----------
block_M : int
Block size in M dimension.
block_N : int
Block size in N dimension.
block_K : int
Block size in K dimension.
num_stages : int
Number of pipelined stages (for asynchronous load).
thread_num : int
Number of threads to use per block.
enable_rasteration : bool
Whether to enable rasterization (swizzling) optimization.
k_pack : int
K dimension packing factor to improve memory coalescing.
Returns
-------
Function
A TVM Tensor Language function (T.prim_func) that computes matmul.
"""
# Use half-precision for input data to reduce memory bandwidth,
# accumulate in float for better numerical accuracy
dtype = "float16" dtype = "float16"
accum_dtype = "float" accum_dtype = "float"
...@@ -221,58 +39,37 @@ def matmul(M, N, K, with_roller): ...@@ -221,58 +39,37 @@ def matmul(M, N, K, with_roller):
B: T.Tensor((N, K), dtype), B: T.Tensor((N, K), dtype),
C: T.Tensor((M, N), dtype), C: T.Tensor((M, N), dtype),
): ):
"""
The compiled TVM function for block-level matrix multiplication.
- We divide the entire (M, N) domain into blocks of shape
(block_M, block_N).
- Each block has its own allocated shared memory for sub-blocks
of A and B.
- The partial results go into C_local, and then we copy them back
to global memory C.
"""
# Bind x-dimension to block index in N,
# y-dimension to block index in M.
with T.Kernel( with T.Kernel(
T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by): T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=thread_num) as (bx, by):
# Allocate shared memory for A sub-block of shape (block_M, block_K)
A_shared = T.alloc_shared((block_M, block_K), dtype) A_shared = T.alloc_shared((block_M, block_K), dtype)
# Allocate shared memory for B sub-block of shape (block_N, block_K)
B_shared = T.alloc_shared((block_N, block_K), dtype) B_shared = T.alloc_shared((block_N, block_K), dtype)
# Allocate a local fragment for intermediate accumulation
C_local = T.alloc_fragment((block_M, block_N), accum_dtype) C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
# Allocate a shared memory for C sub-block of shape (block_M, block_N)
C_shared = T.alloc_shared((block_M, block_N), dtype) C_shared = T.alloc_shared((block_M, block_N), dtype)
# Enable (or disable) swizzling optimization
T.use_swizzle(panel_size=10, enable=enable_rasteration) T.use_swizzle(panel_size=10, enable=enable_rasteration)
# Clear out the accumulation buffer
T.clear(C_local) T.clear(C_local)
# Loop over sub-blocks in K dimension, pipelined by num_stages
for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
# Load a sub-block of A from global memory into A_shared
T.copy(A[by * block_M, k * block_K], A_shared) T.copy(A[by * block_M, k * block_K], A_shared)
# Load a sub-block of B from global memory into B_shared
T.copy(B[bx * block_N, k * block_K], B_shared) T.copy(B[bx * block_N, k * block_K], B_shared)
# Perform a partial matrix multiplication:
# C_local += A_shared @ B_shared^T
T.gemm( T.gemm(
A_shared, A_shared,
B_shared, B_shared,
C_local, C_local,
transpose_B=True, transpose_B=True,
policy=policy,
) )
# Write back the results from C_local to the global memory C
T.copy(C_local, C_shared) T.copy(C_local, C_shared)
T.copy(C_shared, C[by * block_M, bx * block_N]) T.copy(C_shared, C[by * block_M, bx * block_N])
return main return main
return kernel() autotuner = AutoTuner.from_kernel(
kernel=kernel, configs=get_configs()).set_compile_args(
out_idx=[-1],
supply_type=tl.TensorSupplyType.Integer,
ref_prog=ref_program,
skip_check=False,
target="auto",
)
return autotuner.run(warmup=3, rep=20)
if __name__ == "__main__": if __name__ == "__main__":
...@@ -281,25 +78,17 @@ if __name__ == "__main__": ...@@ -281,25 +78,17 @@ if __name__ == "__main__":
parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M")
parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N")
parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K")
parser.add_argument(
"--with_roller",
action="store_true",
help="Whether to enable BitBLAS roller for search space",
)
args = parser.parse_args() args = parser.parse_args()
M, N, K = args.m, args.n, args.k M, N, K = args.m, args.n, args.k
with_roller = args.with_roller
# Compute total floating-point operations to measure throughput # Compute total floating-point operations to measure throughput
total_flops = 2 * M * N * K total_flops = 2 * M * N * K
# matmul(...) returns (best_latency, best_config, ref_latency) result = run(M, N, K)
best_latency, best_config, ref_latency = matmul(M, N, K, with_roller)
# Print out the benchmark results print(f"Latency: {result.latency}")
print(f"Latency: {best_latency}") print(f"TFlops: {total_flops / result.latency * 1e-9:.3f}")
print(f"TFlops: {total_flops / best_latency * 1e-9:.3f}") print(f"Config: {result.config}")
print(f"Config: {best_config}")
print(f"Reference TFlops: {total_flops / ref_latency * 1e-9:.3f}") print(f"Reference TFlops: {total_flops / result.ref_latency * 1e-9:.3f}")
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment