"benchmark/git@developer.sourcefind.cn:OpenDAS/tilelang.git" did not exist on "b3d6f03cea2710497a8704c083148813ee0826f3"
Unverified commit b66f9aae, authored by Lei Wang and committed by GitHub

[Math] Dispatch `T.rsqrt(x)` into cuda intrin instead of `1 / T.sqrt(x)` (#781)

* Fix type hint for the target_host parameter in the compile function to allow a None value

* Refactor target handling in compile function to utilize determine_target for improved clarity and consistency

* Update PrintConst function in codegen_cuda.cc to use hexfloat format for bfloat16 and float8/float4 types, while adding scientific notation comments for clarity. This change enhances the representation of floating-point constants in the generated code.

* Refactor PrintType function in codegen_cuda.cc to remove unnecessary failure conditions for floating-point types with lane counts greater than 4. This change simplifies the logic and improves code clarity.

* Enhance benchmark_matmul.py to print Reference TFlops only when ref_latency is not None. Update param.py to ensure target is converted to a string for consistency. Refactor tuner.py to utilize determine_target for improved clarity in target handling.

* Remove automatic commit and push step from AMD and NVIDIA CI workflows to streamline the process and avoid unnecessary commits.

* Add intrin_rule source files to CMakeLists.txt and implement hrsqrt function for half_t in common.h

* lint fix

* Remove the cmake dependency from pyproject.toml, as it may lead to different cmake paths in different build stages

* lint fix

* Add cmake dependency to pyproject.toml and improve build logging in setup.py
parent 021e44e3
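
For context, this commit changes what the CUDA backend emits for `T.rsqrt(x)`: instead of expanding it to `1 / T.sqrt(x)` (a division plus a square root), the new intrinsic rule lowers it directly to CUDA's reciprocal-square-root functions. A minimal sketch of the intended codegen difference for a float32 input (the kernel below is illustrative, not code from this repository):

```cuda
// Illustrative only: the CUDA math call the new lowering rule is expected to
// pick for T.rsqrt on float32, per the CUDAMath rule in intrin_rule_cuda.cc.
__global__ void rsqrt_demo(const float *__restrict__ x,
                           float *__restrict__ y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Previously emitted as: y[i] = 1.0f / sqrtf(x[i]);
    y[i] = rsqrtf(x[i]); // single reciprocal-square-root intrinsic
  }
}
```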
......@@ -124,6 +124,8 @@ tilelang_file_glob(GLOB TILE_LANG_SRCS
src/target/rt_mod_cpp.cc
# webgpu doesn't have system dependency
src/target/codegen_webgpu.cc
# intrin_rule doesn't have system dependency
src/target/intrin_rule*.cc
)
# Include CUDA source files if CUDA is enabled
......
......@@ -203,6 +203,7 @@ def get_cplus_compiler():
return None
@functools.lru_cache(maxsize=None)
def get_cython_compiler() -> Optional[str]:
"""Return the path to the Cython compiler.
......@@ -238,6 +239,17 @@ def get_cython_compiler() -> Optional[str]:
return None
@functools.lru_cache(maxsize=None)
def get_cmake_path() -> str:
    """Return the path to the CMake executable."""
    # Locate the cmake executable on PATH; shutil.which returns None if it is absent.
    cmake_path = shutil.which("cmake")
    if cmake_path is None or not os.path.exists(cmake_path):
        raise Exception("CMake is not installed, please install it first.")
    return cmake_path
def get_system_info():
system = platform.system().lower()
if system == "linux":
......@@ -338,33 +350,6 @@ def update_submodules():
raise RuntimeError("Failed to update submodules") from error
def build_csrc(llvm_config_path):
"""Configures and builds TVM."""
if not os.path.exists("build"):
os.makedirs("build")
os.chdir("build")
# Copy the config.cmake as a baseline
if not os.path.exists("config.cmake"):
shutil.copy("../3rdparty/tvm/cmake/config.cmake", "config.cmake")
# Set LLVM path and enable CUDA or ROCM in config.cmake
with open("config.cmake", "a") as config_file:
config_file.write(f"set(USE_LLVM {llvm_config_path})\n")
if USE_ROCM:
config_file.write(f"set(USE_ROCM {ROCM_HOME})\n")
config_file.write("set(USE_CUDA OFF)\n")
else:
config_file.write(f"set(USE_CUDA {CUDA_HOME})\n")
config_file.write("set(USE_ROCM OFF)\n")
# Run CMake and make
try:
subprocess.check_call(["cmake", ".."])
num_jobs = max(1, int(multiprocessing.cpu_count() * 0.75))
subprocess.check_call(["make", f"-j{num_jobs}"])
except subprocess.CalledProcessError as error:
raise RuntimeError("Failed to build TileLang C Source") from error
def setup_llvm_for_tvm():
"""Downloads and extracts LLVM, then configures TVM to use it."""
# Assume the download_and_extract_llvm function and its dependencies are defined elsewhere in this script
......@@ -627,7 +612,10 @@ class TilelangExtensionBuild(build_ext):
def run(self):
# Check if CMake is installed and accessible by attempting to run 'cmake --version'.
try:
subprocess.check_output(["cmake", "--version"])
cmake_path = get_cmake_path()
if not cmake_path:
raise Exception("CMake is not installed, please install it first.")
subprocess.check_output([cmake_path, "--version"])
except OSError as error:
# If CMake is not found, raise an error.
raise RuntimeError(
......@@ -830,15 +818,25 @@ class TilelangExtensionBuild(build_ext):
else:
print(f"[Config] No changes: {dst_config}")
cmake_path = get_cmake_path()
# Run CMake to configure the project with the given arguments.
if not os.path.exists(build_temp + "/build.ninja"):
subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=build_temp)
if not os.path.exists(os.path.join(build_temp, "build.ninja")):
logger.info(
f"[CMake] Generating build.ninja: {cmake_path} {ext.sourcedir} {' '.join(cmake_args)}"
)
subprocess.check_call([cmake_path, ext.sourcedir] + cmake_args, cwd=build_temp)
else:
logger.info(f"[CMake] build.ninja already exists in {build_temp}")
# Build the project in "Release" mode with all available CPU cores ("-j").
num_jobs = max(1, int(multiprocessing.cpu_count() * 0.75))
subprocess.check_call(["cmake", "--build", ".", "--config", "Release", "-j",
str(num_jobs)],
cwd=build_temp)
logger.info(
f"[Build] Using {num_jobs} jobs | cmake: {cmake_path} (exists: {os.path.exists(cmake_path)}) | build dir: {build_temp}"
)
subprocess.check_call(
[cmake_path, "--build", ".", "--config", "Release", "-j",
str(num_jobs)],
cwd=build_temp)
setup(
......
/*!
* \file intrin_rule_cuda.cc
* \brief CUDA intrinsic rules.
*/
#include <tvm/tir/builtin.h>
#include <tvm/tir/op_attr_types.h>
#include "target/intrin_rule.h"
namespace tvm {
namespace codegen {
namespace intrin {
// Add float suffix to the intrinsics, CUDA fast math.
using tir::FLowerIntrinsic;
struct CUDAMath {
std::string operator()(DataType t, std::string name) const {
if (t.is_float()) {
switch (t.bits()) {
case 64:
return name;
case 32:
return name + 'f';
case 16: {
if (name == "fabs") {
return "__habs";
} else if (name == "round") {
return "hrint";
} else {
return "h" + name;
}
}
default:
return "";
}
} else if (t.is_bfloat16()) {
if (name == "fabs") {
return "__habs";
} else if (name == "round") {
return "hrint";
} else {
return "h" + name;
}
} else if (t.is_int() || t.is_uint()) {
switch (t.bits()) {
case 32:
return "__" + name;
case 64:
return "__" + name + "ll";
default:
return "";
}
}
return "";
}
};
struct CUDAFastMath : public CUDAMath {
std::string operator()(DataType t, std::string name) const {
if (t.is_float() && t.bits() == 32) {
return "__" + name + 'f';
} else {
return CUDAMath::operator()(t, name);
}
return "";
}
};
struct CUDAFastMathTan : public CUDAMath {
std::string operator()(DataType t, std::string name) const {
if (t.is_float()) {
switch (t.bits()) {
case 64:
return name;
// `__tanf` seems to produce some values too deviant from numpy tan
// version. So, let's use just `tanf` instead.
case 32:
return name + 'f';
case 16:
return 'h' + name;
default:
return "";
}
}
return "";
}
};
struct CUDAPopcount {
std::string operator()(DataType t, std::string name) const {
if (t.is_uint()) {
switch (t.bits()) {
case 32:
return "__popc";
case 64:
return "__popcll";
default:
return "";
}
}
return "";
}
};
struct CUDAWarpIntrinsic {
const Op operator()(DataType t, const Op &orig_op) const {
if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
return Op::Get("tir.cuda.__shfl_sync");
} else if (orig_op.same_as(builtin::tvm_warp_shuffle_up())) {
return Op::Get("tir.cuda.__shfl_up_sync");
} else {
ICHECK(orig_op.same_as(builtin::tvm_warp_shuffle_down()));
return Op::Get("tir.cuda.__shfl_down_sync");
}
}
};
static PrimExpr DispatchCUDAWarpActiveMask(const PrimExpr &e) {
const CallNode *call = e.as<CallNode>();
return Call(call->dtype, Op::Get("tir.cuda.__activemask"), call->args);
}
template <typename T> static PrimExpr DispatchCUDAShuffle(const PrimExpr &e) {
const CallNode *call = e.as<CallNode>();
ICHECK(call != nullptr);
ICHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size
Array<PrimExpr> cuda_args{
{call->args[0], call->args[1], call->args[2], call->args[3]}};
return Call(call->dtype, T()(call->dtype, Downcast<Op>(call->op)), cuda_args);
}
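// Note (illustrative): via CUDAMath above, this registration lowers tir.rsqrt
// to rsqrt(x) for float64, rsqrtf(x) for float32, and hrsqrt(x) for
// float16/bfloat16 (a half_t overload of hrsqrt is added in common.h below),
// instead of expanding it to 1 / sqrt(x).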
TVM_REGISTER_OP("tir.rsqrt")
.set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic",
DispatchPureExtern<CUDAMath>);
} // namespace intrin
} // namespace codegen
} // namespace tvm
......@@ -55,6 +55,11 @@ TL_PATCH TL_DEVICE half_t __habs(const half_t x) {
return half_t(__habs(x.to_half()));
}
// hrsqrt for half_t: forwards to CUDA's hrsqrt(__half) overload from cuda_fp16.h
TL_PATCH TL_DEVICE half_t hrsqrt(const half_t x) {
return half_t(hrsqrt(x.to_half()));
}
// Pack two half values.
TL_DEVICE unsigned __pack_half2(const half x, const half y) {
unsigned v0 = *((unsigned short *)&x);
......
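
As a companion to the float32 sketch above, the half-precision path relies on the `hrsqrt` wrapper added to common.h. A minimal device-side sketch, assuming `cuda_fp16.h` and a GPU with native half math; the kernel itself is hypothetical:

```cuda
#include <cuda_fp16.h>

// Illustrative only: for float16 data, the new rule is expected to emit a
// call to hrsqrt (wrapped for half_t in common.h) rather than a division by
// hsqrt(x).
__global__ void rsqrt_half_demo(const __half *__restrict__ x,
                                __half *__restrict__ y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = hrsqrt(x[i]); // CUDA half-precision reciprocal square root
  }
}
```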