"benchmark/git@developer.sourcefind.cn:OpenDAS/tilelang.git" did not exist on "b3d6f03cea2710497a8704c083148813ee0826f3"
Unverified commit b66f9aae, authored by Lei Wang and committed by GitHub

[Math] Dispatch `T.rsqrt(x)` into cuda intrin instead of `1 / T.sqrt(x)` (#781)

* Fix type hint for the target_host parameter in the compile function to allow a None value

* Refactor target handling in compile function to utilize determine_target for improved clarity and consistency

* Update PrintConst function in codegen_cuda.cc to use hexfloat format for bfloat16 and float8/float4 types, while adding scientific notation comments for clarity. This change enhances the representation of floating-point constants in the generated code.

* Refactor PrintType function in codegen_cuda.cc to remove unnecessary failure conditions for floating-point types with lane counts greater than 4. This change simplifies the logic and improves code clarity.

* Enhance benchmark_matmul.py to print Reference TFlops only when ref_latency is not None. Update param.py to ensure target is converted to a string for consistency. Refactor tuner.py to utilize determine_target for improved clarity in target handling.

* Remove automatic commit and push step from AMD and NVIDIA CI workflows to streamline the process and avoid unnecessary commits.

* Add intrin_rule source files to CMakeLists.txt and implement hrsqrt function for half_t in common.h

* lint fix

* Remove the cmake dependency from pyproject.toml, as it may lead to different cmake paths in different build stages

* lint fix

* Add cmake dependency to pyproject.toml and improve build logging in setup.py
parent 021e44e3
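
For context, this commit changes what the CUDA backend emits for `T.rsqrt(x)`: instead of expanding it to `1 / T.sqrt(x)` (a division plus a square root), the new intrinsic rule lowers it directly to CUDA's reciprocal-square-root functions. A minimal sketch of the intended codegen difference for a float32 input (the kernel below is illustrative, not code from this repository):

```cuda
// Illustrative only: the CUDA math call the new lowering rule is expected to
// pick for T.rsqrt on float32, per the CUDAMath rule in intrin_rule_cuda.cc.
__global__ void rsqrt_demo(const float *__restrict__ x,
                           float *__restrict__ y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Previously emitted as: y[i] = 1.0f / sqrtf(x[i]);
    y[i] = rsqrtf(x[i]); // single reciprocal-square-root intrinsic
  }
}
```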
......@@ -124,6 +124,8 @@ tilelang_file_glob(GLOB TILE_LANG_SRCS
src/target/rt_mod_cpp.cc
# webgpu doesn't have system dependency
src/target/codegen_webgpu.cc
# intrin_rule doesn't have system dependency
src/target/intrin_rule*.cc
)
# Include CUDA source files if CUDA is enabled
......
......@@ -203,6 +203,7 @@ def get_cplus_compiler():
return None
@functools.lru_cache(maxsize=None)
def get_cython_compiler() -> Optional[str]:
"""Return the path to the Cython compiler.
......@@ -238,6 +239,17 @@ def get_cython_compiler() -> Optional[str]:
return None
@functools.lru_cache(maxsize=None)
def get_cmake_path() -> str:
    """Return the path to the CMake executable."""
    # Locate the cmake executable on PATH; shutil.which returns None if it is absent.
    cmake_path = shutil.which("cmake")
    if cmake_path is None or not os.path.exists(cmake_path):
        raise Exception("CMake is not installed, please install it first.")
    return cmake_path
def get_system_info():
system = platform.system().lower()
if system == "linux":
......@@ -338,33 +350,6 @@ def update_submodules():
raise RuntimeError("Failed to update submodules") from error
def build_csrc(llvm_config_path):
"""Configures and builds TVM."""
if not os.path.exists("build"):
os.makedirs("build")
os.chdir("build")
# Copy the config.cmake as a baseline
if not os.path.exists("config.cmake"):
shutil.copy("../3rdparty/tvm/cmake/config.cmake", "config.cmake")
# Set LLVM path and enable CUDA or ROCM in config.cmake
with open("config.cmake", "a") as config_file:
config_file.write(f"set(USE_LLVM {llvm_config_path})\n")
if USE_ROCM:
config_file.write(f"set(USE_ROCM {ROCM_HOME})\n")
config_file.write("set(USE_CUDA OFF)\n")
else:
config_file.write(f"set(USE_CUDA {CUDA_HOME})\n")
config_file.write("set(USE_ROCM OFF)\n")
# Run CMake and make
try:
subprocess.check_call(["cmake", ".."])
num_jobs = max(1, int(multiprocessing.cpu_count() * 0.75))
subprocess.check_call(["make", f"-j{num_jobs}"])
except subprocess.CalledProcessError as error:
raise RuntimeError("Failed to build TileLang C Source") from error
def setup_llvm_for_tvm():
"""Downloads and extracts LLVM, then configures TVM to use it."""
# Assume the download_and_extract_llvm function and its dependencies are defined elsewhere in this script
......@@ -627,7 +612,10 @@ class TilelangExtensionBuild(build_ext):
def run(self):
# Check if CMake is installed and accessible by attempting to run 'cmake --version'.
try:
subprocess.check_output(["cmake", "--version"])
cmake_path = get_cmake_path()
if not cmake_path:
raise Exception("CMake is not installed, please install it first.")
subprocess.check_output([cmake_path, "--version"])
except OSError as error:
# If CMake is not found, raise an error.
raise RuntimeError(
......@@ -830,15 +818,25 @@ class TilelangExtensionBuild(build_ext):
else:
print(f"[Config] No changes: {dst_config}")
cmake_path = get_cmake_path()
# Run CMake to configure the project with the given arguments.
if not os.path.exists(build_temp + "/build.ninja"):
subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=build_temp)
if not os.path.exists(os.path.join(build_temp, "build.ninja")):
logger.info(
f"[CMake] Generating build.ninja: {cmake_path} {ext.sourcedir} {' '.join(cmake_args)}"
)
subprocess.check_call([cmake_path, ext.sourcedir] + cmake_args, cwd=build_temp)
else:
logger.info(f"[CMake] build.ninja already exists in {build_temp}")
# Build the project in "Release" mode with all available CPU cores ("-j").
num_jobs = max(1, int(multiprocessing.cpu_count() * 0.75))
subprocess.check_call(["cmake", "--build", ".", "--config", "Release", "-j",
str(num_jobs)],
cwd=build_temp)
logger.info(
f"[Build] Using {num_jobs} jobs | cmake: {cmake_path} (exists: {os.path.exists(cmake_path)}) | build dir: {build_temp}"
)
subprocess.check_call(
[cmake_path, "--build", ".", "--config", "Release", "-j",
str(num_jobs)],
cwd=build_temp)
setup(
......
/*!
* \file intrin_rule_cuda.cc
* \brief CUDA intrinsic rules.
*/
#include <tvm/tir/builtin.h>
#include <tvm/tir/op_attr_types.h>
#include "target/intrin_rule.h"
namespace tvm {
namespace codegen {
namespace intrin {
// Add float suffix to the intrinsics, CUDA fast math.
using tir::FLowerIntrinsic;
struct CUDAMath {
std::string operator()(DataType t, std::string name) const {
if (t.is_float()) {
switch (t.bits()) {
case 64:
return name;
case 32:
return name + 'f';
case 16: {
if (name == "fabs") {
return "__habs";
} else if (name == "round") {
return "hrint";
} else {
return "h" + name;
}
}
default:
return "";
}
} else if (t.is_bfloat16()) {
if (name == "fabs") {
return "__habs";
} else if (name == "round") {
return "hrint";
} else {
return "h" + name;
}
} else if (t.is_int() || t.is_uint()) {
switch (t.bits()) {
case 32:
return "__" + name;
case 64:
return "__" + name + "ll";
default:
return "";
}
}
return "";
}
};
struct CUDAFastMath : public CUDAMath {
std::string operator()(DataType t, std::string name) const {
if (t.is_float() && t.bits() == 32) {
return "__" + name + 'f';
} else {
return CUDAMath::operator()(t, name);
}
return "";
}
};
struct CUDAFastMathTan : public CUDAMath {
std::string operator()(DataType t, std::string name) const {
if (t.is_float()) {
switch (t.bits()) {
case 64:
return name;
// `__tanf` seems to produce some values too deviant from numpy tan
// version. So, let's use just `tanf` instead.
case 32:
return name + 'f';
case 16:
return 'h' + name;
default:
return "";
}
}
return "";
}
};
struct CUDAPopcount {
std::string operator()(DataType t, std::string name) const {
if (t.is_uint()) {
switch (t.bits()) {
case 32:
return "__popc";
case 64:
return "__popcll";
default:
return "";
}
}
return "";
}
};
struct CUDAWarpIntrinsic {
const Op operator()(DataType t, const Op &orig_op) const {
if (orig_op.same_as(builtin::tvm_warp_shuffle())) {
return Op::Get("tir.cuda.__shfl_sync");
} else if (orig_op.same_as(builtin::tvm_warp_shuffle_up())) {
return Op::Get("tir.cuda.__shfl_up_sync");
} else {
ICHECK(orig_op.same_as(builtin::tvm_warp_shuffle_down()));
return Op::Get("tir.cuda.__shfl_down_sync");
}
}
};
static PrimExpr DispatchCUDAWarpActiveMask(const PrimExpr &e) {
const CallNode *call = e.as<CallNode>();
return Call(call->dtype, Op::Get("tir.cuda.__activemask"), call->args);
}
template <typename T> static PrimExpr DispatchCUDAShuffle(const PrimExpr &e) {
const CallNode *call = e.as<CallNode>();
ICHECK(call != nullptr);
ICHECK_EQ(call->args.size(), 5); // mask, value, warp_id, width, warp_size
Array<PrimExpr> cuda_args{
{call->args[0], call->args[1], call->args[2], call->args[3]}};
return Call(call->dtype, T()(call->dtype, Downcast<Op>(call->op)), cuda_args);
}
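// Note (illustrative): via CUDAMath above, this registration lowers tir.rsqrt
// to rsqrt(x) for float64, rsqrtf(x) for float32, and hrsqrt(x) for
// float16/bfloat16 (a half_t overload of hrsqrt is added in common.h below),
// instead of expanding it to 1 / sqrt(x).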
TVM_REGISTER_OP("tir.rsqrt")
.set_attr<FLowerIntrinsic>("cuda.FLowerIntrinsic",
DispatchPureExtern<CUDAMath>);
} // namespace intrin
} // namespace codegen
} // namespace tvm
......@@ -55,6 +55,11 @@ TL_PATCH TL_DEVICE half_t __habs(const half_t x) {
return half_t(__habs(x.to_half()));
}
// hrsqrt for half_t: forwards to CUDA's hrsqrt(__half) overload from cuda_fp16.h
TL_PATCH TL_DEVICE half_t hrsqrt(const half_t x) {
return half_t(hrsqrt(x.to_half()));
}
// Pack two half values.
TL_DEVICE unsigned __pack_half2(const half x, const half y) {
unsigned v0 = *((unsigned short *)&x);
......
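
As a companion to the float32 sketch above, the half-precision path relies on the `hrsqrt` wrapper added to common.h. A minimal device-side sketch, assuming `cuda_fp16.h` and a GPU with native half math; the kernel itself is hypothetical:

```cuda
#include <cuda_fp16.h>

// Illustrative only: for float16 data, the new rule is expected to emit a
// call to hrsqrt (wrapped for half_t in common.h) rather than a division by
// hsqrt(x).
__global__ void rsqrt_half_demo(const __half *__restrict__ x,
                                __half *__restrict__ y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = hrsqrt(x[i]); // CUDA half-precision reciprocal square root
  }
}
```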