Unverified Commit 8366330c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #159 from YdrMaster/iluvatar

issue/158/feat: 支持天数
parents f6a645a3 2d77211b
......@@ -38,18 +38,19 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
参数 `XMAKE_CONFIG_FLAGS` 是 xmake 构建配置,可配置下列可选项:
| 选项 | 功能 | 默认值
|--------------------------|-------------------------------|:-:
| `--omp=[y\|n]` | 是否使用 OpenMP | y
| `--cpu=[y\|n]` | 是否编译 CPU 接口实现 | y
| `--nv-gpu=[y\|n]` | 是否编译英伟达 GPU 接口实现 | n
| `--ascend-npu=[y\|n]` | 是否编译昇腾 NPU 接口实现 | n
| `--cambricon-mlu=[y\|n]` | 是否编译寒武纪 MLU 接口实现 | n
| `--metax-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n
| `--moore-gpu=[y\|n]` | 是否编译摩尔线程 GPU 接口实现 | n
| `--sugon-dcu=[y\|n]` | 是否编译曙光 DCU 接口实现 | n
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
| 选项 | 功能 | 默认值
|--------------------------|-----------------------------------|:-:
| `--omp=[y\|n]` | 是否使用 OpenMP | y
| `--cpu=[y\|n]` | 是否编译 CPU 接口实现 | y
| `--nv-gpu=[y\|n]` | 是否编译英伟达 GPU 接口实现 | n
| `--ascend-npu=[y\|n]` | 是否编译昇腾 NPU 接口实现 | n
| `--cambricon-mlu=[y\|n]` | 是否编译寒武纪 MLU 接口实现 | n
| `--metax-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n
| `--moore-gpu=[y\|n]` | 是否编译摩尔线程 GPU 接口实现 | n
| `--iluvatar-gpu=[y\|n]`  | 是否编译天数智芯 GPU 接口实现     | n
| `--sugon-dcu=[y\|n]` | 是否编译曙光 DCU 接口实现 | n
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
### 手动安装
......@@ -114,11 +115,13 @@ python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend]
#### 通信库(InfiniCCL)测试
编译(需要先安装InfiniCCL):
```shell
xmake build infiniccl-test
```
在英伟达平台运行测试(会自动使用所有可见的卡):
```shell
infiniccl-test --nvidia
```
......
......@@ -68,8 +68,10 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
return CUDNN_DATA_INT8;
case INFINI_DTYPE_I32:
return CUDNN_DATA_INT32;
#ifndef ENABLE_ILUVATAR_CUDA_API
case INFINI_DTYPE_I64:
return CUDNN_DATA_INT64;
#endif
case INFINI_DTYPE_U8:
return CUDNN_DATA_UINT8;
default:
......
......@@ -6,6 +6,7 @@
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096
#define CUDA_BLOCK_SIZE_1024 1024
#define CUDA_BLOCK_SIZE_512 512
......@@ -51,10 +52,12 @@ exp_(const float val) {
return expf(val);
}
#ifndef ENABLE_ILUVATAR_CUDA_API
__forceinline__ __device__ long double
exp_(const long double val) {
return expl(val);
}
#endif
__forceinline__ __device__ double
exp_(const double val) {
......
......@@ -69,6 +69,10 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
_info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
......
......@@ -2,6 +2,7 @@
#define __CLIP_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace op::clip::cuda {
......@@ -12,17 +13,14 @@ public:
template <typename T>
__device__ __forceinline__ T operator()(const T &x, const T &min_val, const T &max_val) const {
if constexpr (std::is_same_v<T, half2>) {
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
#ifndef ENABLE_ILUVATAR_CUDA_API
return __hmax2(__hmin2(x, max_val), min_val);
} else if constexpr (std::is_same_v<T, half>) {
return __hmax(__hmin(x, max_val), min_val);
} else if constexpr (std::is_same_v<T, float>) {
return fmaxf(fminf(x, max_val), min_val);
} else if constexpr (std::is_same_v<T, double>) {
return fmax(fmin(x, max_val), min_val);
} else {
return std::max(std::min(x, max_val), min_val);
#else
return {std::clamp(x.x, min_val.x, max_val.x), std::clamp(x.y, min_val.y, max_val.y)};
#endif
}
return std::clamp(x, min_val, max_val);
}
} ClipOp;
} // namespace op::clip::cuda
......
......@@ -43,20 +43,34 @@ infiniStatus_t Descriptor::calculate(
void *stream) const {
cudaDataType a_type, b_type, c_type;
#ifdef ENABLE_ILUVATAR_CUDA_API
cudaDataType compute_type;
#else
cublasComputeType_t compute_type;
#endif
switch (_dtype) {
case INFINI_DTYPE_F16:
a_type = b_type = c_type = CUDA_R_16F;
#ifdef ENABLE_ILUVATAR_CUDA_API
compute_type = CUDA_R_32F;
#else
compute_type = CUBLAS_COMPUTE_32F;
#endif
break;
case INFINI_DTYPE_BF16:
a_type = b_type = c_type = CUDA_R_16BF;
#ifdef ENABLE_ILUVATAR_CUDA_API
compute_type = CUDA_R_32F;
#else
compute_type = CUBLAS_COMPUTE_32F;
#endif
break;
case INFINI_DTYPE_F32:
a_type = b_type = c_type = CUDA_R_32F;
#ifdef ENABLE_SUGON_CUDA_API
#if defined ENABLE_ILUVATAR_CUDA_API
compute_type = CUDA_R_32F;
#elif defined ENABLE_SUGON_CUDA_API
compute_type = CUBLAS_COMPUTE_32F;
#else
compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
......
......@@ -8,10 +8,7 @@
namespace op::gemm {
class BlasMatrix {
BlasMatrix() = default;
public:
struct BlasMatrix {
size_t ndim;
size_t batch;
ptrdiff_t stride;
......
......@@ -233,17 +233,8 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
CHECK_OR_RETURN(grid_num <= MAX_GRID_ARRAY_SIZE && grid_num != 0, INFINI_STATUS_BAD_PARAM);
CHECK_OR_RETURN(block_num <= MAX_BLOCK_ARRAY_SIZE && block_num != 0, INFINI_STATUS_BAD_PARAM);
CHECK_OR_RETURN(constraint_num <= 2, INFINI_STATUS_BAD_PARAM);
auto block_len = params.block_len.data();
auto src_block_stride = params.src_block_stride.data();
auto dst_block_stride = params.dst_block_stride.data();
auto grid_len = params.grid_len.data();
auto src_grid_stride = params.src_grid_stride.data();
auto dst_grid_stride = params.dst_grid_stride.data();
auto constrain = params.constraints.data();
void *kernel_func = nullptr;
#define GET_REARRANGE_KERNEL(Tmem_type, block_array_size, grid_array_size, constraint_num) \
kernel_func = (void *)rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num;
......
......@@ -95,6 +95,8 @@ infiniStatus_t Descriptor::calculate(
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, cuda_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
......
......@@ -38,8 +38,12 @@ infiniStatus_t streamSynchronize(infinirtStream_t stream) {
}
infiniStatus_t streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
#ifdef ENABLE_ILUVATAR_CUDA_API
return INFINI_STATUS_NOT_IMPLEMENTED;
#else
CHECK_CUDART(cudaStreamWaitEvent((cudaStream_t)stream, (cudaEvent_t)event));
return INFINI_STATUS_SUCCESS;
#endif
}
infiniStatus_t eventCreate(infinirtEvent_t *event_ptr) {
......
......@@ -46,6 +46,19 @@ if has_config("nv-gpu") then
includes("xmake/cuda.lua")
end
-- 天数智芯
-- Build option for Iluvatar (天数智芯) GPU support; disabled by default.
option("iluvatar-gpu")
set_default(false)
set_showmenu(true)
set_description("Whether to compile implementations for Iluvatar GPU")
option_end()
-- Iluvatar's toolchain is CUDA-compatible: define the generic CUDA API macro
-- as well as the Iluvatar-specific one, then pull in the Iluvatar build script.
if has_config("iluvatar-gpu") then
add_defines("ENABLE_CUDA_API")
add_defines("ENABLE_ILUVATAR_CUDA_API")
includes("xmake/iluvatar.lua")
end
-- 寒武纪
option("cambricon-mlu")
set_default(false)
......@@ -174,6 +187,9 @@ target("infinirt")
if has_config("moore-gpu") then
add_deps("infinirt-moore")
end
if has_config("iluvatar-gpu") then
add_deps("infinirt-iluvatar")
end
if has_config("kunlun-xpu") then
add_deps("infinirt-kunlun")
end
......@@ -193,6 +209,9 @@ target("infiniop")
if has_config("nv-gpu") then
add_deps("infiniop-cuda")
end
if has_config("iluvatar-gpu") then
add_deps("infiniop-iluvatar")
end
if has_config("sugon-dcu") then
local builddir = string.format(
"build/%s/%s/%s",
......@@ -248,7 +267,7 @@ target("infiniccl")
if has_config("metax-gpu") then
add_deps("infiniccl-metax")
end
set_languages("cxx17")
add_files("src/infiniccl/*.cc")
......
-- Toolchain for Iluvatar GPUs: the vendor SDK is clang-based, so every
-- compile/link stage (C, C++, CUDA-source compile and link) maps to
-- clang / clang++.
toolchain("iluvatar.toolchain")
set_toolset("cc" , "clang" )
set_toolset("cxx" , "clang++")
set_toolset("cu" , "clang++")
set_toolset("culd", "clang++")
-- Host compiler for CUDA sources is taken from the CXX/CC environment
-- variables (first match wins).
set_toolset("cu-ccbin", "$(env CXX)", "$(env CC)")
toolchain_end()
-- Post-load fixup applied after cuda.env: cuda.env adds `cudadevrt`
-- (the device-side runtime used for relocatable device code) to the
-- system link list, but the Iluvatar SDK does not ship it, so it must
-- be stripped before linking.
rule("iluvatar.env")
add_deps("cuda.env", {order = true})
after_load(function (target)
    -- target:get may return nil when no syslinks were set; guard so
    -- ipairs/# below cannot error.
    local old = target:get("syslinks") or {}
    local kept = {}
    for _, link in ipairs(old) do
        if link ~= "cudadevrt" then
            table.insert(kept, link)
        end
    end
    -- Only rewrite (and log) when something was actually removed.
    if #old > #kept then
        target:set("syslinks", kept)
        -- table.concat also handles the all-removed case cleanly,
        -- unlike the previous manual ", "-trimming.
        print("cudadevrt removed, syslinks = { " .. table.concat(kept, ", ") .. " }")
    end
end)
rule_end()
-- Static library with the infiniop operator implementations for Iluvatar
-- GPUs, built from the shared CUDA sources with the vendor clang toolchain.
target("infiniop-iluvatar")
set_kind("static")
add_deps("infini-utils")
on_install(function (target) end)
set_toolchains("iluvatar.toolchain")
add_rules("iluvatar.env")
-- No relocatable device code (matches cudadevrt removal in iluvatar.env).
set_values("cuda.rdc", false)
add_links("cudart", "cublas", "cudnn")
set_warnings("all", "error")
-- "-x ivcore" selects the Iluvatar device target. -fPIC is already force-added
-- here, so the previously duplicated add_cuflags("-fPIC") call was dropped.
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC")
-- set_languages("cxx17") reportedly does not work with the Iluvatar toolchain,
-- hence -std=c++17 is passed explicitly above.
add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu")
target_end()
-- Static library with the infinirt runtime implementation for Iluvatar GPUs,
-- built from the shared CUDA runtime sources with the vendor clang toolchain.
target("infinirt-iluvatar")
set_kind("static")
add_deps("infini-utils")
on_install(function (target) end)
set_toolchains("iluvatar.toolchain")
add_rules("iluvatar.env")
-- No relocatable device code (matches cudadevrt removal in iluvatar.env).
set_values("cuda.rdc", false)
add_links("cudart")
set_warnings("all", "error")
-- "-x ivcore" selects the Iluvatar device target. -fPIC is already force-added
-- here, so the previously duplicated add_cuflags("-fPIC") call was dropped.
add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
add_culdflags("-fPIC")
add_cxflags("-fPIC")
-- set_languages("cxx17") reportedly does not work with the Iluvatar toolchain,
-- hence -std=c++17 is passed explicitly above.
add_files("../src/infinirt/cuda/*.cu")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment