Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
400fad38
Unverified
Commit
400fad38
authored
Jul 15, 2025
by
Jiacheng Huang
Committed by
GitHub
Jul 15, 2025
Browse files
issue/277: 添加 ReLU 算子的九齿实现
parent
95623d82
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
326 additions
and
4 deletions
+326
-4
README.md
README.md
+31
-0
scripts/build_ntops.py
scripts/build_ntops.py
+1
-1
src/infiniop/ninetoothed/build.py
src/infiniop/ninetoothed/build.py
+4
-2
src/infiniop/ops/relu/metax/relu_metax.h
src/infiniop/ops/relu/metax/relu_metax.h
+12
-0
src/infiniop/ops/relu/metax/relu_metax.maca
src/infiniop/ops/relu/metax/relu_metax.maca
+80
-0
src/infiniop/ops/relu/ninetoothed/build.py
src/infiniop/ops/relu/ninetoothed/build.py
+30
-0
src/infiniop/ops/relu/nvidia/relu_nvidia.cu
src/infiniop/ops/relu/nvidia/relu_nvidia.cu
+80
-0
src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
+12
-0
src/infiniop/ops/relu/operator.cc
src/infiniop/ops/relu/operator.cc
+50
-0
xmake.lua
xmake.lua
+11
-0
xmake/metax.lua
xmake/metax.lua
+9
-0
xmake/nvidia.lua
xmake/nvidia.lua
+6
-1
No files found.
README.md
View file @
400fad38
...
...
@@ -52,10 +52,15 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
|
`--iluvatar-gpu=[y\|n]`
| 是否编译沐曦 GPU 接口实现 | n
|
`--sugon-dcu=[y\|n]`
| 是否编译曙光 DCU 接口实现 | n
|
`--kunlun-xpu=[y\|n]`
| 是否编译昆仑 XPU 接口实现 | n
|
`--ninetoothed=[y\|n]`
| 是否编译九齿实现 | n
|
`--ccl=[y\|n]`
| 是否编译 InfiniCCL 通信库接口实现 | n
### 手动安装
0.
生成九齿算子(可选)
参见[使用九齿](#使用九齿)章节。
1.
项目配置
Windows 系统上,建议使用
`xmake v2.8.9`
编译项目。
...
...
@@ -131,6 +136,32 @@ xmake build infiniccl-test
infiniccl-test
--nvidia
```
### 使用九齿
[
九齿
](
https://github.com/InfiniTensor/ninetoothed
)
是一门基于 Triton 但提供更高层抽象的领域特定语言(DSL)。使用九齿可以降低算子的开发门槛,并且提高开发效率。
InfiniCore 目前已经可以接入使用九齿实现的算子,但是这部分实现的编译是默认关闭的。如果选择编译库中的九齿实现,需要使用
`--ninetoothed=y`
,并在运行一键安装脚本前完成以下准备工作:
1.
安装九齿与
[
九齿算子库
](
https://github.com/InfiniTensor/ntops
)
:
```
shell
git clone https://github.com/InfiniTensor/ntops.git
cd
ntops
pip
install
-e
.
```
注:安装
`ntops`
时,
`ninetoothed`
会被当成依赖也一并安装进来。
2.
在
`InfiniCore`
文件夹下运行以下命令 AOT 编译库中的九齿算子:
```
shell
PYTHONPATH
=
src/ python scripts/build_ntops.py
```
注:如果对九齿相关文件有修改,需要重新构建 InfiniCore 时,也需要同时运行以上命令进行重新生成。
3.
按照上面的指引进行
[
一键安装
](
#一键安装
)
或者
[
手动安装
](
#手动安装
)
。
## 如何开源贡献
见
[
`InfiniCore开发者手册`
](
DEV.md
)
。
scripts/build_ntops.py
View file @
400fad38
...
...
@@ -24,6 +24,6 @@ def _find_and_build_ops():
if
__name__
==
"__main__"
:
BUILD_DIRECTORY_PATH
.
mkdir
(
exist_ok
=
True
)
BUILD_DIRECTORY_PATH
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
_find_and_build_ops
()
src/infiniop/ninetoothed/build.py
View file @
400fad38
...
...
@@ -24,7 +24,7 @@ def build(premake, constexpr_param_grid, caller, op_name, output_dir):
for
param_name
,
param_value
in
combination
.
items
():
if
isinstance
(
param_value
,
str
):
combination
[
param_name
]
=
(
f
"INFINI_DTYPE_
{
combination
[
param_name
].
replace
(
'fp'
,
'F'
)
}
"
f
"INFINI_DTYPE_
{
combination
[
param_name
].
replace
(
'fp'
,
'F'
)
.
upper
()
}
"
)
combination
=
{
f
"
{
name
}
_"
:
value
for
name
,
value
in
combination
.
items
()}
...
...
@@ -77,9 +77,11 @@ def build(premake, constexpr_param_grid, caller, op_name, output_dir):
func_sig
=
f
"NineToothedResult launch_
{
op_name
}
(
{
param_decls
}
)"
joined_launches
=
"
\n
"
.
join
(
launches
)
op_decl
=
f
'#ifdef __cplusplus
\n
extern "C"
{
func_sig
}
;
\n
#else
\n
{
func_sig
}
;
\n
#endif'
op_def
=
f
"""
{
func_sig
}
{{
{
"
\n
"
.
join
(
launches
)
}
{
join
ed_
launches
}
return INFINI_STATUS_NOT_IMPLEMENTED;
}}"""
...
...
src/infiniop/ops/relu/metax/relu_metax.h
0 → 100644
View file @
400fad38
#ifndef __RELU_METAX_API_H__
#define __RELU_METAX_API_H__

// The MetaX ReLU backend only exists when the NineToothed implementations
// are compiled in (build configured with --ninetoothed=y).
#ifdef ENABLE_NINETOOTHED

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares op::relu::metax::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(relu, metax)

#endif

#endif // __RELU_METAX_API_H__
src/infiniop/ops/relu/metax/relu_metax.maca
0 → 100644
View file @
400fad38
#ifdef ENABLE_NINETOOTHED

#include "../../../../../build/ninetoothed/relu.h"
#include "../../../devices/metax/metax_common.h"
#include "relu_metax.h"

namespace op::relu::metax {

Descriptor::~Descriptor() = default;

// Creates a ReLU descriptor for the MetaX backend.
// Validates that the output dtype is one of F16/F32/F64/BF16 and that the
// input and output shapes match, then builds the shared elementwise
// descriptor state via CREATE_ELEMENTWISE_METAX_DESCRIPTOR.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create METAX elementwise descriptor
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Applies ReLU by launching the AOT-compiled NineToothed kernel on `stream`.
// `workspace` is unused by this backend beyond the size check; inputs[0] is
// the source tensor and `output` receives the result.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    const auto &ndim{_info.getNdim()};

    // Copy shape/stride views into owned vectors so raw, non-const pointers
    // can be handed to the NineToothed launcher.
    const auto &x_shape_{_info.getInputShape(0)};
    const auto &x_strides_{_info.getInputStrides(0)};
    std::vector<uint64_t> x_shape_vec{x_shape_, x_shape_ + ndim};
    std::vector<int64_t> x_strides_vec{x_strides_, x_strides_ + ndim};
    auto x_data{const_cast<void *>(inputs[0])};
    const NineToothedTensor x{x_data, x_shape_vec.data(), x_strides_vec.data()};

    const auto &y_shape_{_info.getOutputShape()};
    const auto &y_strides_{_info.getOutputStrides()};
    std::vector<uint64_t> y_shape_vec{y_shape_, y_shape_ + ndim};
    std::vector<int64_t> y_strides_vec{y_strides_, y_strides_ + ndim};
    const NineToothedTensor y{output, y_shape_vec.data(), y_strides_vec.data()};

    // Must match the block_size the kernels were AOT-compiled with
    // (see src/infiniop/ops/relu/ninetoothed/build.py).
    constexpr auto block_size{1024};

    switch (_dtype) {
    case INFINI_DTYPE_F16:
    case INFINI_DTYPE_F32:
    case INFINI_DTYPE_F64:
    case INFINI_DTYPE_BF16:
        // launch_relu returns nonzero on failure.
        if (launch_relu(stream, x, y, ndim, _dtype, block_size)) {
            return INFINI_STATUS_INTERNAL_ERROR;
        }
        return INFINI_STATUS_SUCCESS;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch path above returns, so the previously-present trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable and has been removed.
}

} // namespace op::relu::metax

#endif
src/infiniop/ops/relu/ninetoothed/build.py
0 → 100644
View file @
400fad38
import ninetoothed
from ntops.kernels import relu

import infiniop.ninetoothed.build


def build():
    """AOT-compile the NineToothed ReLU kernels.

    Builds one kernel per (ndim, dtype, block_size) combination and writes
    the generated sources into the shared NineToothed build directory.
    """
    MAX_NDIM = 5

    ndim_values = range(1, MAX_NDIM + 1)

    dtype_values = (
        ninetoothed.float16,
        ninetoothed.bfloat16,
        ninetoothed.float32,
        ninetoothed.float64,
    )

    constexpr_param_grid = {
        "ndim": ndim_values,
        "dtype": dtype_values,
        "block_size": (1024,),
    }

    infiniop.ninetoothed.build.build(
        relu.premake,
        constexpr_param_grid,
        caller="cuda",
        op_name="relu",
        output_dir=infiniop.ninetoothed.build.BUILD_DIRECTORY_PATH,
    )
src/infiniop/ops/relu/nvidia/relu_nvidia.cu
0 → 100644
View file @
400fad38
#ifdef ENABLE_NINETOOTHED

#include "../../../../../build/ninetoothed/relu.h"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "relu_nvidia.cuh"

namespace op::relu::nvidia {

Descriptor::~Descriptor() = default;

// Creates a ReLU descriptor for the NVIDIA backend.
// Validates that the output dtype is one of F16/F32/F64/BF16 and that the
// input and output shapes match, then builds the shared elementwise
// descriptor state via CREATE_ELEMENTWISE_CUDA_DESCRIPTOR.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Applies ReLU by launching the AOT-compiled NineToothed kernel on `stream`.
// `workspace` is unused by this backend beyond the size check; inputs[0] is
// the source tensor and `output` receives the result.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    const auto &ndim{_info.getNdim()};

    // Copy shape/stride views into owned vectors so raw, non-const pointers
    // can be handed to the NineToothed launcher.
    const auto &x_shape_{_info.getInputShape(0)};
    const auto &x_strides_{_info.getInputStrides(0)};
    std::vector<uint64_t> x_shape_vec{x_shape_, x_shape_ + ndim};
    std::vector<int64_t> x_strides_vec{x_strides_, x_strides_ + ndim};
    auto x_data{const_cast<void *>(inputs[0])};
    const NineToothedTensor x{x_data, x_shape_vec.data(), x_strides_vec.data()};

    const auto &y_shape_{_info.getOutputShape()};
    const auto &y_strides_{_info.getOutputStrides()};
    std::vector<uint64_t> y_shape_vec{y_shape_, y_shape_ + ndim};
    std::vector<int64_t> y_strides_vec{y_strides_, y_strides_ + ndim};
    const NineToothedTensor y{output, y_shape_vec.data(), y_strides_vec.data()};

    // Must match the block_size the kernels were AOT-compiled with
    // (see src/infiniop/ops/relu/ninetoothed/build.py).
    constexpr auto block_size{1024};

    switch (_dtype) {
    case INFINI_DTYPE_F16:
    case INFINI_DTYPE_F32:
    case INFINI_DTYPE_F64:
    case INFINI_DTYPE_BF16:
        // launch_relu returns nonzero on failure.
        if (launch_relu(stream, x, y, ndim, _dtype, block_size)) {
            return INFINI_STATUS_INTERNAL_ERROR;
        }
        return INFINI_STATUS_SUCCESS;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch path above returns, so the previously-present trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable and has been removed.
}

} // namespace op::relu::nvidia

#endif
src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
0 → 100644
View file @
400fad38
#ifndef __RELU_NVIDIA_API_H__
#define __RELU_NVIDIA_API_H__

// The NVIDIA ReLU backend only exists when the NineToothed implementations
// are compiled in (build configured with --ninetoothed=y).
#ifdef ENABLE_NINETOOTHED

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

// Declares op::relu::nvidia::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(relu, nvidia)

#endif

#endif // __RELU_NVIDIA_API_H__
src/infiniop/ops/relu/operator.cc
View file @
400fad38
...
...
@@ -5,6 +5,16 @@
#ifdef ENABLE_CPU_API
#include "cpu/relu_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
#include "nvidia/relu_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
#include "metax/relu_metax.h"
#endif
#endif
__C
infiniStatus_t
infiniopCreateReluDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -24,6 +34,16 @@ __C infiniStatus_t infiniopCreateReluDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
...
...
@@ -43,6 +63,16 @@ __C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, s
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -69,6 +99,16 @@ __C infiniStatus_t infiniopRelu(
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
...
...
@@ -90,6 +130,16 @@ infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) {
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
...
...
xmake.lua
View file @
400fad38
...
...
@@ -145,6 +145,17 @@ if has_config("kunlun-xpu") then
includes
(
"xmake/kunlun.lua"
)
end
-- 九齿
option
(
"ninetoothed"
)
set_default
(
false
)
set_showmenu
(
true
)
set_description
(
"Whether to complie NineToothed implementations"
)
option_end
()
if
has_config
(
"ninetoothed"
)
then
add_defines
(
"ENABLE_NINETOOTHED"
)
end
-- InfiniCCL
option
(
"ccl"
)
set_default
(
false
)
...
...
xmake/metax.lua
View file @
400fad38
...
...
@@ -23,6 +23,11 @@ rule("maca")
table.insert
(
args
,
"-I"
..
includedir
)
end
local
defines
=
target
:
get
(
"defines"
)
for
_
,
define
in
ipairs
(
defines
)
do
table.insert
(
args
,
"-D"
..
define
)
end
os
.
execv
(
htcc
,
args
)
table.insert
(
target
:
objectfiles
(),
objectfile
)
end
)
...
...
@@ -36,6 +41,10 @@ target("infiniop-metax")
add_cxflags
(
"-lstdc++"
,
"-fPIC"
,
"-Wno-defaulted-function-deleted"
,
"-Wno-strict-aliasing"
)
add_files
(
"../src/infiniop/devices/metax/*.cc"
,
"../src/infiniop/ops/*/metax/*.cc"
)
add_files
(
"../src/infiniop/ops/*/metax/*.maca"
,
{
rule
=
"maca"
})
if
has_config
(
"ninetoothed"
)
then
add_files
(
"../build/ninetoothed/*.c"
,
{
cxflags
=
{
"-include stdlib.h"
,
"-Wno-return-type"
}})
end
target_end
()
target
(
"infinirt-metax"
)
...
...
xmake/nvidia.lua
View file @
400fad38
...
...
@@ -21,6 +21,7 @@ target("infiniop-nvidia")
local
nvcc
=
find_tool
(
"nvcc"
)
if
nvcc
~=
nil
then
target
:
add
(
"linkdirs"
,
path
.
directory
(
path
.
directory
(
nvcc
.
program
))
..
"/lib64/stubs"
)
target
:
add
(
"links"
,
"cuda"
)
end
end
)
...
...
@@ -46,7 +47,11 @@ target("infiniop-nvidia")
add_cuflags
(
"-Xcompiler=-Wno-error=deprecated-declarations"
)
set_languages
(
"cxx17"
)
add_files
(
"../src/infiniop/devices/nvidia/*.cu"
,
"../src/infiniop/ops/*/nvidia/*.cu"
,
"../build/ninetoothed/*.c"
)
add_files
(
"../src/infiniop/devices/nvidia/*.cu"
,
"../src/infiniop/ops/*/nvidia/*.cu"
)
if
has_config
(
"ninetoothed"
)
then
add_files
(
"../build/ninetoothed/*.c"
)
end
target_end
()
target
(
"infinirt-nvidia"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment