Unverified Commit 400fad38 authored by Jiacheng Huang's avatar Jiacheng Huang Committed by GitHub
Browse files

issue/277: 添加 ReLU 算子的九齿实现

parent 95623d82
...@@ -52,10 +52,15 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS] ...@@ -52,10 +52,15 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
| `--iluvatar-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n | `--iluvatar-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n
| `--sugon-dcu=[y\|n]` | 是否编译曙光 DCU 接口实现 | n | `--sugon-dcu=[y\|n]` | 是否编译曙光 DCU 接口实现 | n
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n | `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
| `--ninetoothed=[y\|n]` | 是否编译九齿实现 | n
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n | `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
### 手动安装 ### 手动安装
0. 生成九齿算子(可选)
参见[使用九齿](#使用九齿)章节。
1. 项目配置 1. 项目配置
windows系统上,建议使用`xmake v2.8.9`编译项目。 windows系统上,建议使用`xmake v2.8.9`编译项目。
...@@ -131,6 +136,32 @@ xmake build infiniccl-test ...@@ -131,6 +136,32 @@ xmake build infiniccl-test
infiniccl-test --nvidia infiniccl-test --nvidia
``` ```
### 使用九齿
[九齿](https://github.com/InfiniTensor/ninetoothed)是一门基于 Triton 但提供更高层抽象的领域特定语言(DSL)。使用九齿可以降低算子的开发门槛,并且提高开发效率。
InfiniCore 目前已经可以接入使用九齿实现的算子,但是这部分实现的编译是默认关闭的。如果选择编译库中的九齿实现,需要使用 `--ninetoothed=y`,并在运行一键安装脚本前完成以下准备工作:
1. 安装九齿与[九齿算子库](https://github.com/InfiniTensor/ntops)
```shell
git clone https://github.com/InfiniTensor/ntops.git
cd ntops
pip install -e .
```
注:安装 `ntops` 时,`ninetoothed` 会被当成依赖也一并安装进来。
2. 在 `InfiniCore` 文件夹下运行以下命令 AOT 编译库中的九齿算子:
```shell
PYTHONPATH=src/ python scripts/build_ntops.py
```
注:如果对九齿相关文件有修改,在重新构建 InfiniCore 时,也需要同时运行以上命令进行重新生成。
3. 按照上面的指引进行[一键安装](#一键安装)或者[手动安装](#手动安装)
## 如何开源贡献 ## 如何开源贡献
[`InfiniCore开发者手册`](DEV.md) [`InfiniCore开发者手册`](DEV.md)
...@@ -24,6 +24,6 @@ def _find_and_build_ops(): ...@@ -24,6 +24,6 @@ def _find_and_build_ops():
if __name__ == "__main__": if __name__ == "__main__":
BUILD_DIRECTORY_PATH.mkdir(exist_ok=True) BUILD_DIRECTORY_PATH.mkdir(parents=True, exist_ok=True)
_find_and_build_ops() _find_and_build_ops()
...@@ -24,7 +24,7 @@ def build(premake, constexpr_param_grid, caller, op_name, output_dir): ...@@ -24,7 +24,7 @@ def build(premake, constexpr_param_grid, caller, op_name, output_dir):
for param_name, param_value in combination.items(): for param_name, param_value in combination.items():
if isinstance(param_value, str): if isinstance(param_value, str):
combination[param_name] = ( combination[param_name] = (
f"INFINI_DTYPE_{combination[param_name].replace('fp', 'F')}" f"INFINI_DTYPE_{combination[param_name].replace('fp', 'F').upper()}"
) )
combination = {f"{name}_": value for name, value in combination.items()} combination = {f"{name}_": value for name, value in combination.items()}
...@@ -77,9 +77,11 @@ def build(premake, constexpr_param_grid, caller, op_name, output_dir): ...@@ -77,9 +77,11 @@ def build(premake, constexpr_param_grid, caller, op_name, output_dir):
func_sig = f"NineToothedResult launch_{op_name}({param_decls})" func_sig = f"NineToothedResult launch_{op_name}({param_decls})"
joined_launches = "\n".join(launches)
op_decl = f'#ifdef __cplusplus\nextern "C" {func_sig};\n#else\n{func_sig};\n#endif' op_decl = f'#ifdef __cplusplus\nextern "C" {func_sig};\n#else\n{func_sig};\n#endif'
op_def = f"""{func_sig} {{ op_def = f"""{func_sig} {{
{"\n".join(launches)} {joined_launches}
return INFINI_STATUS_NOT_IMPLEMENTED; return INFINI_STATUS_NOT_IMPLEMENTED;
}}""" }}"""
......
#ifndef __RELU_METAX_API_H__
#define __RELU_METAX_API_H__
// Public descriptor API for the Metax backend of the ReLU operator.
// Only compiled when the NineToothed build option is enabled, because the
// underlying kernel is AOT-generated into build/ninetoothed/.
#ifdef ENABLE_NINETOOTHED
#include "../../../elementwise/metax/elementwise_metax_api.h"
// Declares op::relu::metax::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(relu, metax)
#endif
#endif // __RELU_METAX_API_H__
#ifdef ENABLE_NINETOOTHED
#include "../../../../../build/ninetoothed/relu.h"
#include "../../../devices/metax/metax_common.h"
#include "relu_metax.h"
// Metax (MACA) backend of the ReLU operator, backed by an AOT-compiled
// NineToothed kernel (launch_relu, declared in build/ninetoothed/relu.h).
namespace op::relu::metax {
Descriptor::~Descriptor() = default;
// Validates descriptors and constructs a Metax ReLU descriptor.
// Accepts F16/F32/F64/BF16 and requires input/output shapes to match.
// Note: CHECK_DTYPE / CHECK_SAME_SHAPE / CREATE_ELEMENTWISE_METAX_DESCRIPTOR
// are project macros that may return early with an error status.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create METAX elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
// Runs y = relu(x) on `stream` by dispatching to the generated launch_relu.
// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE when the caller's buffer is
// smaller than the size recorded at create time.
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const auto &ndim{_info.getNdim()};
// Copy input metadata into the integer widths NineToothedTensor expects
// (uint64_t shape, int64_t strides).
const auto &x_shape_{_info.getInputShape(0)};
const auto &x_strides_{_info.getInputStrides(0)};
std::vector<uint64_t> x_shape_vec{x_shape_, x_shape_ + ndim};
std::vector<int64_t> x_strides_vec{x_strides_, x_strides_ + ndim};
auto x_data{const_cast<void *>(inputs[0])};
auto x_shape{x_shape_vec.data()};
auto x_strides{x_strides_vec.data()};
// NOTE(review): x/y hold raw pointers into the local vectors above, which
// are destroyed when this function returns — assumes launch_relu consumes
// the metadata synchronously at launch time; confirm if the launch is async.
const NineToothedTensor x{x_data, x_shape, x_strides};
const auto &y_shape_{_info.getOutputShape()};
const auto &y_strides_{_info.getOutputStrides()};
std::vector<uint64_t> y_shape_vec{y_shape_, y_shape_ + ndim};
std::vector<int64_t> y_strides_vec{y_strides_, y_strides_ + ndim};
auto y_data{output};
auto y_shape{y_shape_vec.data()};
auto y_strides{y_strides_vec.data()};
const NineToothedTensor y{y_data, y_shape, y_strides};
// Must match the "block_size" value used when AOT-building the kernels
// (see the NineToothed build script's constexpr parameter grid).
constexpr auto block_size{1024};
switch (_dtype) {
case INFINI_DTYPE_F16:
case INFINI_DTYPE_F32:
case INFINI_DTYPE_F64:
case INFINI_DTYPE_BF16:
// All supported dtypes share one entry point: the dtype is forwarded
// as a runtime argument and dispatched inside the generated code.
if (launch_relu(stream, x, y, ndim, _dtype, block_size)) {
return INFINI_STATUS_INTERNAL_ERROR;
}
return INFINI_STATUS_SUCCESS;
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
// Unreachable: every switch path returns; kept to satisfy compilers that
// warn about a missing return at the end of a non-void function.
return INFINI_STATUS_SUCCESS;
}
} // namespace op::relu::metax
#endif
import ninetoothed
from ntops.kernels import relu
import infiniop.ninetoothed.build
def build():
    """AOT-build the NineToothed ReLU kernels for every supported
    (ndim, dtype) combination, writing the results to the shared
    NineToothed build directory."""
    max_ndim = 5
    supported_dtypes = (
        ninetoothed.float16,
        ninetoothed.bfloat16,
        ninetoothed.float32,
        ninetoothed.float64,
    )
    # Compile-time parameter grid: one kernel variant per combination.
    grid = {
        "ndim": range(1, max_ndim + 1),
        "dtype": supported_dtypes,
        "block_size": (1024,),
    }
    infiniop.ninetoothed.build.build(
        relu.premake,
        grid,
        caller="cuda",
        op_name="relu",
        output_dir=infiniop.ninetoothed.build.BUILD_DIRECTORY_PATH,
    )
#ifdef ENABLE_NINETOOTHED
#include "../../../../../build/ninetoothed/relu.h"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "relu_nvidia.cuh"
// NVIDIA (CUDA) backend of the ReLU operator, backed by an AOT-compiled
// NineToothed kernel (launch_relu, declared in build/ninetoothed/relu.h).
namespace op::relu::nvidia {
Descriptor::~Descriptor() = default;
// Validates descriptors and constructs a CUDA ReLU descriptor.
// Accepts F16/F32/F64/BF16 and requires input/output shapes to match.
// Note: CHECK_DTYPE / CHECK_SAME_SHAPE / CREATE_ELEMENTWISE_CUDA_DESCRIPTOR
// are project macros that may return early with an error status.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
// Runs y = relu(x) on `stream` by dispatching to the generated launch_relu.
// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE when the caller's buffer is
// smaller than the size recorded at create time.
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const auto &ndim{_info.getNdim()};
// Copy input metadata into the integer widths NineToothedTensor expects
// (uint64_t shape, int64_t strides).
const auto &x_shape_{_info.getInputShape(0)};
const auto &x_strides_{_info.getInputStrides(0)};
std::vector<uint64_t> x_shape_vec{x_shape_, x_shape_ + ndim};
std::vector<int64_t> x_strides_vec{x_strides_, x_strides_ + ndim};
auto x_data{const_cast<void *>(inputs[0])};
auto x_shape{x_shape_vec.data()};
auto x_strides{x_strides_vec.data()};
// NOTE(review): x/y hold raw pointers into the local vectors above, which
// are destroyed when this function returns — assumes launch_relu consumes
// the metadata synchronously at launch time; confirm if the launch is async.
const NineToothedTensor x{x_data, x_shape, x_strides};
const auto &y_shape_{_info.getOutputShape()};
const auto &y_strides_{_info.getOutputStrides()};
std::vector<uint64_t> y_shape_vec{y_shape_, y_shape_ + ndim};
std::vector<int64_t> y_strides_vec{y_strides_, y_strides_ + ndim};
auto y_data{output};
auto y_shape{y_shape_vec.data()};
auto y_strides{y_strides_vec.data()};
const NineToothedTensor y{y_data, y_shape, y_strides};
// Must match the "block_size" value used when AOT-building the kernels
// (see the NineToothed build script's constexpr parameter grid).
constexpr auto block_size{1024};
switch (_dtype) {
case INFINI_DTYPE_F16:
case INFINI_DTYPE_F32:
case INFINI_DTYPE_F64:
case INFINI_DTYPE_BF16:
// All supported dtypes share one entry point: the dtype is forwarded
// as a runtime argument and dispatched inside the generated code.
if (launch_relu(stream, x, y, ndim, _dtype, block_size)) {
return INFINI_STATUS_INTERNAL_ERROR;
}
return INFINI_STATUS_SUCCESS;
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
// Unreachable: every switch path returns; kept to satisfy compilers that
// warn about a missing return at the end of a non-void function.
return INFINI_STATUS_SUCCESS;
}
} // namespace op::relu::nvidia
#endif
#ifndef __RELU_NVIDIA_API_H__
#define __RELU_NVIDIA_API_H__
// Public descriptor API for the NVIDIA backend of the ReLU operator.
// Only compiled when the NineToothed build option is enabled, because the
// underlying kernel is AOT-generated into build/ninetoothed/.
#ifdef ENABLE_NINETOOTHED
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
// Declares op::relu::nvidia::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(relu, nvidia)
#endif
#endif // __RELU_NVIDIA_API_H__
...@@ -5,6 +5,16 @@ ...@@ -5,6 +5,16 @@
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
#include "cpu/relu_cpu.h" #include "cpu/relu_cpu.h"
#endif #endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
#include "nvidia/relu_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
#include "metax/relu_metax.h"
#endif
#endif
__C infiniStatus_t infiniopCreateReluDescriptor( __C infiniStatus_t infiniopCreateReluDescriptor(
infiniopHandle_t handle, infiniopHandle_t handle,
...@@ -24,6 +34,16 @@ __C infiniStatus_t infiniopCreateReluDescriptor( ...@@ -24,6 +34,16 @@ __C infiniStatus_t infiniopCreateReluDescriptor(
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu); CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#endif #endif
default: default:
...@@ -43,6 +63,16 @@ __C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, s ...@@ -43,6 +63,16 @@ __C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, s
switch (desc->device_type) { switch (desc->device_type) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu) GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
GET(INFINI_DEVICE_METAX, metax)
#endif
#endif #endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -69,6 +99,16 @@ __C infiniStatus_t infiniopRelu( ...@@ -69,6 +99,16 @@ __C infiniStatus_t infiniopRelu(
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu); CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#endif #endif
default: default:
...@@ -90,6 +130,16 @@ infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) { ...@@ -90,6 +130,16 @@ infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu); DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#endif #endif
default: default:
......
...@@ -145,6 +145,17 @@ if has_config("kunlun-xpu") then ...@@ -145,6 +145,17 @@ if has_config("kunlun-xpu") then
includes("xmake/kunlun.lua") includes("xmake/kunlun.lua")
end end
-- NineToothed (Triton-based DSL) operator implementations.
-- Off by default: enabling it requires the kernels to be AOT-built first
-- (see scripts/build_ntops.py in the README).
option("ninetoothed")
    set_default(false)
    set_showmenu(true)
    -- Fixed typo: "complie" -> "compile" in the user-visible description.
    set_description("Whether to compile NineToothed implementations")
option_end()

if has_config("ninetoothed") then
    add_defines("ENABLE_NINETOOTHED")
end
-- InfiniCCL -- InfiniCCL
option("ccl") option("ccl")
set_default(false) set_default(false)
......
...@@ -23,6 +23,11 @@ rule("maca") ...@@ -23,6 +23,11 @@ rule("maca")
table.insert(args, "-I" .. includedir) table.insert(args, "-I" .. includedir)
end end
-- Forward the target's compile-time defines (e.g. ENABLE_NINETOOTHED) to the
-- htcc command line. target:get("defines") returns nil when the target has no
-- defines, and ipairs(nil) raises an error — guard with `or {}`.
local defines = target:get("defines")
for _, define in ipairs(defines or {}) do
    table.insert(args, "-D" .. define)
end
os.execv(htcc, args) os.execv(htcc, args)
table.insert(target:objectfiles(), objectfile) table.insert(target:objectfiles(), objectfile)
end) end)
...@@ -36,6 +41,10 @@ target("infiniop-metax") ...@@ -36,6 +41,10 @@ target("infiniop-metax")
add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing") add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
add_files("../src/infiniop/devices/metax/*.cc", "../src/infiniop/ops/*/metax/*.cc") add_files("../src/infiniop/devices/metax/*.cc", "../src/infiniop/ops/*/metax/*.cc")
add_files("../src/infiniop/ops/*/metax/*.maca", {rule = "maca"}) add_files("../src/infiniop/ops/*/metax/*.maca", {rule = "maca"})
-- Compile the AOT-generated NineToothed kernel wrappers into the Metax target.
-- -include stdlib.h: force-include stdlib declarations for the generated C
-- sources; -Wno-return-type: silence warnings in generated code.
if has_config("ninetoothed") then
add_files("../build/ninetoothed/*.c", {cxflags = {"-include stdlib.h", "-Wno-return-type"}})
end
target_end() target_end()
target("infinirt-metax") target("infinirt-metax")
......
...@@ -21,6 +21,7 @@ target("infiniop-nvidia") ...@@ -21,6 +21,7 @@ target("infiniop-nvidia")
local nvcc = find_tool("nvcc") local nvcc = find_tool("nvcc")
if nvcc ~= nil then if nvcc ~= nil then
target:add("linkdirs", path.directory(path.directory(nvcc.program)) .. "/lib64/stubs") target:add("linkdirs", path.directory(path.directory(nvcc.program)) .. "/lib64/stubs")
target:add("links", "cuda")
end end
end) end)
...@@ -46,7 +47,11 @@ target("infiniop-nvidia") ...@@ -46,7 +47,11 @@ target("infiniop-nvidia")
add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations") add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
set_languages("cxx17") set_languages("cxx17")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../build/ninetoothed/*.c") add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
-- Compile the AOT-generated NineToothed kernel wrappers into the NVIDIA
-- target only when the ninetoothed build option is enabled.
if has_config("ninetoothed") then
add_files("../build/ninetoothed/*.c")
end
target_end() target_end()
target("infinirt-nvidia") target("infinirt-nvidia")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment