Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
400fad38
Unverified
Commit
400fad38
authored
Jul 15, 2025
by
Jiacheng Huang
Committed by
GitHub
Jul 15, 2025
Browse files
issue/277: 添加 ReLU 算子的九齿实现
parent
95623d82
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
326 additions
and
4 deletions
+326
-4
README.md
README.md
+31
-0
scripts/build_ntops.py
scripts/build_ntops.py
+1
-1
src/infiniop/ninetoothed/build.py
src/infiniop/ninetoothed/build.py
+4
-2
src/infiniop/ops/relu/metax/relu_metax.h
src/infiniop/ops/relu/metax/relu_metax.h
+12
-0
src/infiniop/ops/relu/metax/relu_metax.maca
src/infiniop/ops/relu/metax/relu_metax.maca
+80
-0
src/infiniop/ops/relu/ninetoothed/build.py
src/infiniop/ops/relu/ninetoothed/build.py
+30
-0
src/infiniop/ops/relu/nvidia/relu_nvidia.cu
src/infiniop/ops/relu/nvidia/relu_nvidia.cu
+80
-0
src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
+12
-0
src/infiniop/ops/relu/operator.cc
src/infiniop/ops/relu/operator.cc
+50
-0
xmake.lua
xmake.lua
+11
-0
xmake/metax.lua
xmake/metax.lua
+9
-0
xmake/nvidia.lua
xmake/nvidia.lua
+6
-1
No files found.
README.md
View file @
400fad38
...
...
@@ -52,10 +52,15 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
|
`--iluvatar-gpu=[y\|n]`
| 是否编译沐曦 GPU 接口实现 | n
|
`--sugon-dcu=[y\|n]`
| 是否编译曙光 DCU 接口实现 | n
|
`--kunlun-xpu=[y\|n]`
| 是否编译昆仑 XPU 接口实现 | n
|
`--ninetoothed=[y\|n]`
| 是否编译九齿实现 | n
|
`--ccl=[y\|n]`
| 是否编译 InfiniCCL 通信库接口实现 | n
### 手动安装
0.
生成九齿算子(可选)
参见[使用九齿](#使用九齿)章节。
1.
项目配置
Windows 系统上,建议使用
`xmake v2.8.9`
编译项目。
...
...
@@ -131,6 +136,32 @@ xmake build infiniccl-test
infiniccl-test
--nvidia
```
### 使用九齿
[
九齿
](
https://github.com/InfiniTensor/ninetoothed
)
是一门基于 Triton 但提供更高层抽象的领域特定语言(DSL)。使用九齿可以降低算子的开发门槛,并且提高开发效率。
InfiniCore 目前已经可以接入使用九齿实现的算子,但是这部分实现的编译是默认关闭的。如果选择编译库中的九齿实现,需要使用
`--ninetoothed=y`
,并在运行一键安装脚本前完成以下准备工作:
1.
安装九齿与
[
九齿算子库
](
https://github.com/InfiniTensor/ntops
)
:
```
shell
git clone https://github.com/InfiniTensor/ntops.git
cd
ntops
pip
install
-e
.
```
注:安装
`ntops`
时,
`ninetoothed`
会被当成依赖也一并安装进来。
2.
在
`InfiniCore`
文件夹下运行以下命令 AOT 编译库中的九齿算子:
```
shell
PYTHONPATH
=
src/ python scripts/build_ntops.py
```
注:如果对九齿相关文件有修改,需要重新构建 InfiniCore 时,也需要同时运行以上命令进行重新生成。
3.
按照上面的指引进行
[
一键安装
](
#一键安装
)
或者
[
手动安装
](
#手动安装
)
。
## 如何开源贡献
见
[
`InfiniCore开发者手册`
](
DEV.md
)
。
scripts/build_ntops.py
View file @
400fad38
...
...
@@ -24,6 +24,6 @@ def _find_and_build_ops():
if
__name__
==
"__main__"
:
BUILD_DIRECTORY_PATH
.
mkdir
(
exist_ok
=
True
)
BUILD_DIRECTORY_PATH
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
_find_and_build_ops
()
src/infiniop/ninetoothed/build.py
View file @
400fad38
...
...
@@ -24,7 +24,7 @@ def build(premake, constexpr_param_grid, caller, op_name, output_dir):
for
param_name
,
param_value
in
combination
.
items
():
if
isinstance
(
param_value
,
str
):
combination
[
param_name
]
=
(
f
"INFINI_DTYPE_
{
combination
[
param_name
].
replace
(
'fp'
,
'F'
)
}
"
f
"INFINI_DTYPE_
{
combination
[
param_name
].
replace
(
'fp'
,
'F'
)
.
upper
()
}
"
)
combination
=
{
f
"
{
name
}
_"
:
value
for
name
,
value
in
combination
.
items
()}
...
...
@@ -77,9 +77,11 @@ def build(premake, constexpr_param_grid, caller, op_name, output_dir):
func_sig
=
f
"NineToothedResult launch_
{
op_name
}
(
{
param_decls
}
)"
joined_launches
=
"
\n
"
.
join
(
launches
)
op_decl
=
f
'#ifdef __cplusplus
\n
extern "C"
{
func_sig
}
;
\n
#else
\n
{
func_sig
}
;
\n
#endif'
op_def
=
f
"""
{
func_sig
}
{{
{
"
\n
"
.
join
(
launches
)
}
{
join
ed_
launches
}
return INFINI_STATUS_NOT_IMPLEMENTED;
}}"""
...
...
src/infiniop/ops/relu/metax/relu_metax.h
0 → 100644
View file @
400fad38
#ifndef __RELU_METAX_API_H__
#define __RELU_METAX_API_H__

// The MetaX ReLU backend only exists when the NineToothed implementations
// are compiled in (build configured with --ninetoothed=y).
#ifdef ENABLE_NINETOOTHED

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares op::relu::metax::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(relu, metax)

#endif

#endif // __RELU_METAX_API_H__
src/infiniop/ops/relu/metax/relu_metax.maca
0 → 100644
View file @
400fad38
#ifdef ENABLE_NINETOOTHED

#include "../../../../../build/ninetoothed/relu.h"
#include "../../../devices/metax/metax_common.h"
#include "relu_metax.h"

namespace op::relu::metax {

Descriptor::~Descriptor() = default;

// Creates a ReLU descriptor for the MetaX backend.
// Validates that the output dtype is one of F16/F32/F64/BF16 and that the
// input and output shapes match, then builds the shared elementwise
// descriptor state via CREATE_ELEMENTWISE_METAX_DESCRIPTOR.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create METAX elementwise descriptor
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Applies ReLU by launching the AOT-compiled NineToothed kernel on `stream`.
// `workspace` is unused by this backend beyond the size check; inputs[0] is
// the source tensor and `output` receives the result.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    const auto &ndim{_info.getNdim()};

    // Copy shape/stride views into owned vectors so raw, non-const pointers
    // can be handed to the NineToothed launcher.
    const auto &x_shape_{_info.getInputShape(0)};
    const auto &x_strides_{_info.getInputStrides(0)};
    std::vector<uint64_t> x_shape_vec{x_shape_, x_shape_ + ndim};
    std::vector<int64_t> x_strides_vec{x_strides_, x_strides_ + ndim};
    auto x_data{const_cast<void *>(inputs[0])};
    const NineToothedTensor x{x_data, x_shape_vec.data(), x_strides_vec.data()};

    const auto &y_shape_{_info.getOutputShape()};
    const auto &y_strides_{_info.getOutputStrides()};
    std::vector<uint64_t> y_shape_vec{y_shape_, y_shape_ + ndim};
    std::vector<int64_t> y_strides_vec{y_strides_, y_strides_ + ndim};
    const NineToothedTensor y{output, y_shape_vec.data(), y_strides_vec.data()};

    // Must match the block_size the kernels were AOT-compiled with
    // (see src/infiniop/ops/relu/ninetoothed/build.py).
    constexpr auto block_size{1024};

    switch (_dtype) {
    case INFINI_DTYPE_F16:
    case INFINI_DTYPE_F32:
    case INFINI_DTYPE_F64:
    case INFINI_DTYPE_BF16:
        // launch_relu returns nonzero on failure.
        if (launch_relu(stream, x, y, ndim, _dtype, block_size)) {
            return INFINI_STATUS_INTERNAL_ERROR;
        }
        return INFINI_STATUS_SUCCESS;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch path above returns, so the previously-present trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable and has been removed.
}

} // namespace op::relu::metax

#endif
src/infiniop/ops/relu/ninetoothed/build.py
0 → 100644
View file @
400fad38
import ninetoothed
from ntops.kernels import relu

import infiniop.ninetoothed.build


def build():
    """AOT-compile the NineToothed ReLU kernels.

    Builds one kernel per (ndim, dtype, block_size) combination and writes
    the generated sources into the shared NineToothed build directory.
    """
    MAX_NDIM = 5

    ndim_values = range(1, MAX_NDIM + 1)

    dtype_values = (
        ninetoothed.float16,
        ninetoothed.bfloat16,
        ninetoothed.float32,
        ninetoothed.float64,
    )

    constexpr_param_grid = {
        "ndim": ndim_values,
        "dtype": dtype_values,
        "block_size": (1024,),
    }

    infiniop.ninetoothed.build.build(
        relu.premake,
        constexpr_param_grid,
        caller="cuda",
        op_name="relu",
        output_dir=infiniop.ninetoothed.build.BUILD_DIRECTORY_PATH,
    )
src/infiniop/ops/relu/nvidia/relu_nvidia.cu
0 → 100644
View file @
400fad38
#ifdef ENABLE_NINETOOTHED

#include "../../../../../build/ninetoothed/relu.h"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "relu_nvidia.cuh"

namespace op::relu::nvidia {

Descriptor::~Descriptor() = default;

// Creates a ReLU descriptor for the NVIDIA backend.
// Validates that the output dtype is one of F16/F32/F64/BF16 and that the
// input and output shapes match, then builds the shared elementwise
// descriptor state via CREATE_ELEMENTWISE_CUDA_DESCRIPTOR.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Applies ReLU by launching the AOT-compiled NineToothed kernel on `stream`.
// `workspace` is unused by this backend beyond the size check; inputs[0] is
// the source tensor and `output` receives the result.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    const auto &ndim{_info.getNdim()};

    // Copy shape/stride views into owned vectors so raw, non-const pointers
    // can be handed to the NineToothed launcher.
    const auto &x_shape_{_info.getInputShape(0)};
    const auto &x_strides_{_info.getInputStrides(0)};
    std::vector<uint64_t> x_shape_vec{x_shape_, x_shape_ + ndim};
    std::vector<int64_t> x_strides_vec{x_strides_, x_strides_ + ndim};
    auto x_data{const_cast<void *>(inputs[0])};
    const NineToothedTensor x{x_data, x_shape_vec.data(), x_strides_vec.data()};

    const auto &y_shape_{_info.getOutputShape()};
    const auto &y_strides_{_info.getOutputStrides()};
    std::vector<uint64_t> y_shape_vec{y_shape_, y_shape_ + ndim};
    std::vector<int64_t> y_strides_vec{y_strides_, y_strides_ + ndim};
    const NineToothedTensor y{output, y_shape_vec.data(), y_strides_vec.data()};

    // Must match the block_size the kernels were AOT-compiled with
    // (see src/infiniop/ops/relu/ninetoothed/build.py).
    constexpr auto block_size{1024};

    switch (_dtype) {
    case INFINI_DTYPE_F16:
    case INFINI_DTYPE_F32:
    case INFINI_DTYPE_F64:
    case INFINI_DTYPE_BF16:
        // launch_relu returns nonzero on failure.
        if (launch_relu(stream, x, y, ndim, _dtype, block_size)) {
            return INFINI_STATUS_INTERNAL_ERROR;
        }
        return INFINI_STATUS_SUCCESS;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch path above returns, so the previously-present trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable and has been removed.
}

} // namespace op::relu::nvidia

#endif
src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
0 → 100644
View file @
400fad38
#ifndef __RELU_NVIDIA_API_H__
#define __RELU_NVIDIA_API_H__

// The NVIDIA ReLU backend only exists when the NineToothed implementations
// are compiled in (build configured with --ninetoothed=y).
#ifdef ENABLE_NINETOOTHED

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

// Declares op::relu::nvidia::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(relu, nvidia)

#endif

#endif // __RELU_NVIDIA_API_H__
src/infiniop/ops/relu/operator.cc
View file @
400fad38
...
...
@@ -5,6 +5,16 @@
#ifdef ENABLE_CPU_API
#include "cpu/relu_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
#include "nvidia/relu_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
#include "metax/relu_metax.h"
#endif
#endif
__C
infiniStatus_t
infiniopCreateReluDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -24,6 +34,16 @@ __C infiniStatus_t infiniopCreateReluDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
...
...
@@ -43,6 +63,16 @@ __C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, s
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -69,6 +99,16 @@ __C infiniStatus_t infiniopRelu(
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
...
...
@@ -90,6 +130,16 @@ infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) {
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
...
...
xmake.lua
View file @
400fad38
...
...
@@ -145,6 +145,17 @@ if has_config("kunlun-xpu") then
includes
(
"xmake/kunlun.lua"
)
end
-- 九齿
option
(
"ninetoothed"
)
set_default
(
false
)
set_showmenu
(
true
)
set_description
(
"Whether to complie NineToothed implementations"
)
option_end
()
if
has_config
(
"ninetoothed"
)
then
add_defines
(
"ENABLE_NINETOOTHED"
)
end
-- InfiniCCL
option
(
"ccl"
)
set_default
(
false
)
...
...
xmake/metax.lua
View file @
400fad38
...
...
@@ -23,6 +23,11 @@ rule("maca")
table.insert
(
args
,
"-I"
..
includedir
)
end
local
defines
=
target
:
get
(
"defines"
)
for
_
,
define
in
ipairs
(
defines
)
do
table.insert
(
args
,
"-D"
..
define
)
end
os
.
execv
(
htcc
,
args
)
table.insert
(
target
:
objectfiles
(),
objectfile
)
end
)
...
...
@@ -36,6 +41,10 @@ target("infiniop-metax")
add_cxflags
(
"-lstdc++"
,
"-fPIC"
,
"-Wno-defaulted-function-deleted"
,
"-Wno-strict-aliasing"
)
add_files
(
"../src/infiniop/devices/metax/*.cc"
,
"../src/infiniop/ops/*/metax/*.cc"
)
add_files
(
"../src/infiniop/ops/*/metax/*.maca"
,
{
rule
=
"maca"
})
if
has_config
(
"ninetoothed"
)
then
add_files
(
"../build/ninetoothed/*.c"
,
{
cxflags
=
{
"-include stdlib.h"
,
"-Wno-return-type"
}})
end
target_end
()
target
(
"infinirt-metax"
)
...
...
xmake/nvidia.lua
View file @
400fad38
...
...
@@ -21,6 +21,7 @@ target("infiniop-nvidia")
local
nvcc
=
find_tool
(
"nvcc"
)
if
nvcc
~=
nil
then
target
:
add
(
"linkdirs"
,
path
.
directory
(
path
.
directory
(
nvcc
.
program
))
..
"/lib64/stubs"
)
target
:
add
(
"links"
,
"cuda"
)
end
end
)
...
...
@@ -46,7 +47,11 @@ target("infiniop-nvidia")
add_cuflags
(
"-Xcompiler=-Wno-error=deprecated-declarations"
)
set_languages
(
"cxx17"
)
add_files
(
"../src/infiniop/devices/nvidia/*.cu"
,
"../src/infiniop/ops/*/nvidia/*.cu"
,
"../build/ninetoothed/*.c"
)
add_files
(
"../src/infiniop/devices/nvidia/*.cu"
,
"../src/infiniop/ops/*/nvidia/*.cu"
)
if
has_config
(
"ninetoothed"
)
then
add_files
(
"../build/ninetoothed/*.c"
)
end
target_end
()
target
(
"infinirt-nvidia"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment