Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
8366330c
Unverified
Commit
8366330c
authored
Jul 02, 2025
by
PanZezhong1725
Committed by
GitHub
Jul 02, 2025
Browse files
Merge pull request #159 from YdrMaster/iluvatar
issue/158/feat: 支持天数
parents
f6a645a3
2d77211b
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
146 additions
and
36 deletions
+146
-36
README.md
README.md
+15
-12
src/infiniop/devices/cuda/cuda_common.cu
src/infiniop/devices/cuda/cuda_common.cu
+2
-0
src/infiniop/devices/cuda/cuda_kernel_common.cuh
src/infiniop/devices/cuda/cuda_kernel_common.cuh
+3
-0
src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu
src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu
+4
-0
src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh
src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh
+7
-9
src/infiniop/ops/gemm/cuda/gemm_cuda.cu
src/infiniop/ops/gemm/cuda/gemm_cuda.cu
+15
-1
src/infiniop/ops/gemm/info.h
src/infiniop/ops/gemm/info.h
+1
-4
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
+0
-9
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
+2
-0
src/infinirt/cuda/infinirt_cuda.cu
src/infinirt/cuda/infinirt_cuda.cu
+4
-0
xmake.lua
xmake.lua
+20
-1
xmake/iluvatar.lua
xmake/iluvatar.lua
+73
-0
No files found.
README.md
View file @
8366330c
...
...
@@ -38,18 +38,19 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
参数
`XMAKE_CONFIG_FLAGS`
是 xmake 构建配置,可配置下列可选项:
| 选项 | 功能 | 默认值
|--------------------------|-------------------------------|:-:
|
`--omp=[y\|n]`
| 是否使用 OpenMP | y
|
`--cpu=[y\|n]`
| 是否编译 CPU 接口实现 | y
|
`--nv-gpu=[y\|n]`
| 是否编译英伟达 GPU 接口实现 | n
|
`--ascend-npu=[y\|n]`
| 是否编译昇腾 NPU 接口实现 | n
|
`--cambricon-mlu=[y\|n]`
| 是否编译寒武纪 MLU 接口实现 | n
|
`--metax-gpu=[y\|n]`
| 是否编译沐曦 GPU 接口实现 | n
|
`--moore-gpu=[y\|n]`
| 是否编译摩尔线程 GPU 接口实现 | n
|
`--sugon-dcu=[y\|n]`
| 是否编译曙光 DCU 接口实现 | n
|
`--kunlun-xpu=[y\|n]`
| 是否编译昆仑 XPU 接口实现 | n
|
`--ccl=[y\|n]`
| 是否编译 InfiniCCL 通信库接口实现 | n
| 选项 | 功能 | 默认值
|--------------------------|-----------------------------------|:-:
|
`--omp=[y\|n]`
| 是否使用 OpenMP | y
|
`--cpu=[y\|n]`
| 是否编译 CPU 接口实现 | y
|
`--nv-gpu=[y\|n]`
| 是否编译英伟达 GPU 接口实现 | n
|
`--ascend-npu=[y\|n]`
| 是否编译昇腾 NPU 接口实现 | n
|
`--cambricon-mlu=[y\|n]`
| 是否编译寒武纪 MLU 接口实现 | n
|
`--metax-gpu=[y\|n]`
| 是否编译沐曦 GPU 接口实现 | n
|
`--moore-gpu=[y\|n]`
| 是否编译摩尔线程 GPU 接口实现 | n
|
`--iluvatar-gpu=[y\|n]`
| 是否编译天数 GPU 接口实现 | n
|
`--sugon-dcu=[y\|n]`
| 是否编译曙光 DCU 接口实现 | n
|
`--kunlun-xpu=[y\|n]`
| 是否编译昆仑 XPU 接口实现 | n
|
`--ccl=[y\|n]`
| 是否编译 InfiniCCL 通信库接口实现 | n
### 手动安装
...
...
@@ -114,11 +115,13 @@ python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend]
#### 通信库(InfiniCCL)测试
编译(需要先安装InfiniCCL):
```
shell
xmake build infiniccl-test
```
在英伟达平台运行测试(会自动使用所有可见的卡):
```
shell
infiniccl-test
--nvidia
```
...
...
src/infiniop/devices/cuda/cuda_common.cu
View file @
8366330c
...
...
@@ -68,8 +68,10 @@ cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
return
CUDNN_DATA_INT8
;
case
INFINI_DTYPE_I32
:
return
CUDNN_DATA_INT32
;
#ifndef ENABLE_ILUVATAR_CUDA_API
case
INFINI_DTYPE_I64
:
return
CUDNN_DATA_INT64
;
#endif
case
INFINI_DTYPE_U8
:
return
CUDNN_DATA_UINT8
;
default:
...
...
src/infiniop/devices/cuda/cuda_kernel_common.cuh
View file @
8366330c
...
...
@@ -6,6 +6,7 @@
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096
#define CUDA_BLOCK_SIZE_1024 1024
#define CUDA_BLOCK_SIZE_512 512
...
...
@@ -51,10 +52,12 @@ exp_(const float val) {
return
expf
(
val
);
}
#ifndef ENABLE_ILUVATAR_CUDA_API
__forceinline__
__device__
long
double
exp_
(
const
long
double
val
)
{
return
expl
(
val
);
}
#endif
__forceinline__
__device__
double
exp_
(
const
double
val
)
{
...
...
src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu
View file @
8366330c
...
...
@@ -69,6 +69,10 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_512
>
(
y
,
x
,
_info
.
dtype
,
_info
.
batch_size
,
_info
.
seq_len
,
_info
.
total_seq_len
,
_info
.
y_stride_b
,
_info
.
y_stride_i
,
_info
.
x_stride_b
,
_info
.
x_stride_i
,
stream
));
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_4096
>
(
y
,
x
,
_info
.
dtype
,
_info
.
batch_size
,
_info
.
seq_len
,
_info
.
total_seq_len
,
_info
.
y_stride_b
,
_info
.
y_stride_i
,
_info
.
x_stride_b
,
_info
.
x_stride_i
,
stream
));
}
else
{
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
}
...
...
src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh
View file @
8366330c
...
...
@@ -2,6 +2,7 @@
#define __CLIP_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace
op
::
clip
::
cuda
{
...
...
@@ -12,17 +13,14 @@ public:
template
<
typename
T
>
__device__
__forceinline__
T
operator
()(
const
T
&
x
,
const
T
&
min_val
,
const
T
&
max_val
)
const
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
)
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
||
std
::
is_same_v
<
T
,
nv_bfloat162
>
)
{
#ifndef ENABLE_ILUVATAR_CUDA_API
return
__hmax2
(
__hmin2
(
x
,
max_val
),
min_val
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
return
__hmax
(
__hmin
(
x
,
max_val
),
min_val
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
return
fmaxf
(
fminf
(
x
,
max_val
),
min_val
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
double
>
)
{
return
fmax
(
fmin
(
x
,
max_val
),
min_val
);
}
else
{
return
std
::
max
(
std
::
min
(
x
,
max_val
),
min_val
);
#else
return
{
std
::
clamp
(
x
.
x
,
min_val
.
x
,
max_val
.
x
),
std
::
clamp
(
x
.
y
,
min_val
.
y
,
max_val
.
y
)};
#endif
}
return
std
::
clamp
(
x
,
min_val
,
max_val
);
}
}
ClipOp
;
}
// namespace op::clip::cuda
...
...
src/infiniop/ops/gemm/cuda/gemm_cuda.cu
View file @
8366330c
...
...
@@ -43,20 +43,34 @@ infiniStatus_t Descriptor::calculate(
void
*
stream
)
const
{
cudaDataType
a_type
,
b_type
,
c_type
;
#ifdef ENABLE_ILUVATAR_CUDA_API
cudaDataType
compute_type
;
#else
cublasComputeType_t
compute_type
;
#endif
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
a_type
=
b_type
=
c_type
=
CUDA_R_16F
;
#ifdef ENABLE_ILUVATAR_CUDA_API
compute_type
=
CUDA_R_32F
;
#else
compute_type
=
CUBLAS_COMPUTE_32F
;
#endif
break
;
case
INFINI_DTYPE_BF16
:
a_type
=
b_type
=
c_type
=
CUDA_R_16BF
;
#ifdef ENABLE_ILUVATAR_CUDA_API
compute_type
=
CUDA_R_32F
;
#else
compute_type
=
CUBLAS_COMPUTE_32F
;
#endif
break
;
case
INFINI_DTYPE_F32
:
a_type
=
b_type
=
c_type
=
CUDA_R_32F
;
#ifdef ENABLE_SUGON_CUDA_API
#if defined ENABLE_ILUVATAR_CUDA_API
compute_type
=
CUDA_R_32F
;
#elif defined ENABLE_SUGON_CUDA_API
compute_type
=
CUBLAS_COMPUTE_32F
;
#else
compute_type
=
CUBLAS_COMPUTE_32F_FAST_TF32
;
...
...
src/infiniop/ops/gemm/info.h
View file @
8366330c
...
...
@@ -8,10 +8,7 @@
namespace
op
::
gemm
{
class
BlasMatrix
{
BlasMatrix
()
=
default
;
public:
struct
BlasMatrix
{
size_t
ndim
;
size_t
batch
;
ptrdiff_t
stride
;
...
...
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh
View file @
8366330c
...
...
@@ -233,17 +233,8 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams ¶ms) {
CHECK_OR_RETURN
(
grid_num
<=
MAX_GRID_ARRAY_SIZE
&&
grid_num
!=
0
,
INFINI_STATUS_BAD_PARAM
);
CHECK_OR_RETURN
(
block_num
<=
MAX_BLOCK_ARRAY_SIZE
&&
block_num
!=
0
,
INFINI_STATUS_BAD_PARAM
);
CHECK_OR_RETURN
(
constraint_num
<=
2
,
INFINI_STATUS_BAD_PARAM
);
auto
block_len
=
params
.
block_len
.
data
();
auto
src_block_stride
=
params
.
src_block_stride
.
data
();
auto
dst_block_stride
=
params
.
dst_block_stride
.
data
();
auto
grid_len
=
params
.
grid_len
.
data
();
auto
src_grid_stride
=
params
.
src_grid_stride
.
data
();
auto
dst_grid_stride
=
params
.
dst_grid_stride
.
data
();
auto
constrain
=
params
.
constraints
.
data
();
void
*
kernel_func
=
nullptr
;
#define GET_REARRANGE_KERNEL(Tmem_type, block_array_size, grid_array_size, constraint_num) \
kernel_func = (void *)rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num;
...
...
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
View file @
8366330c
...
...
@@ -95,6 +95,8 @@ infiniStatus_t Descriptor::calculate(
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_1024
>
(
batch_size
,
dim
,
y
,
_info
.
atype
,
stride_y
,
x
,
stride_x
,
w
,
_info
.
wtype
,
_info
.
epsilon
,
cuda_stream
));
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_512
)
{
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_512
>
(
batch_size
,
dim
,
y
,
_info
.
atype
,
stride_y
,
x
,
stride_x
,
w
,
_info
.
wtype
,
_info
.
epsilon
,
cuda_stream
));
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_4096
>
(
batch_size
,
dim
,
y
,
_info
.
atype
,
stride_y
,
x
,
stride_x
,
w
,
_info
.
wtype
,
_info
.
epsilon
,
cuda_stream
));
}
else
{
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
}
...
...
src/infinirt/cuda/infinirt_cuda.cu
View file @
8366330c
...
...
@@ -38,8 +38,12 @@ infiniStatus_t streamSynchronize(infinirtStream_t stream) {
}
infiniStatus_t
streamWaitEvent
(
infinirtStream_t
stream
,
infinirtEvent_t
event
)
{
#ifdef ENABLE_ILUVATAR_CUDA_API
return
INFINI_STATUS_NOT_IMPLEMENTED
;
#else
CHECK_CUDART
(
cudaStreamWaitEvent
((
cudaStream_t
)
stream
,
(
cudaEvent_t
)
event
));
return
INFINI_STATUS_SUCCESS
;
#endif
}
infiniStatus_t
eventCreate
(
infinirtEvent_t
*
event_ptr
)
{
...
...
xmake.lua
View file @
8366330c
...
...
@@ -46,6 +46,19 @@ if has_config("nv-gpu") then
includes
(
"xmake/cuda.lua"
)
end
-- 天数智芯
option
(
"iluvatar-gpu"
)
set_default
(
false
)
set_showmenu
(
true
)
set_description
(
"Whether to compile implementations for Iluvatar GPU"
)
option_end
()
if
has_config
(
"iluvatar-gpu"
)
then
add_defines
(
"ENABLE_CUDA_API"
)
add_defines
(
"ENABLE_ILUVATAR_CUDA_API"
)
includes
(
"xmake/iluvatar.lua"
)
end
-- 寒武纪
option
(
"cambricon-mlu"
)
set_default
(
false
)
...
...
@@ -174,6 +187,9 @@ target("infinirt")
if
has_config
(
"moore-gpu"
)
then
add_deps
(
"infinirt-moore"
)
end
if
has_config
(
"iluvatar-gpu"
)
then
add_deps
(
"infinirt-iluvatar"
)
end
if
has_config
(
"kunlun-xpu"
)
then
add_deps
(
"infinirt-kunlun"
)
end
...
...
@@ -193,6 +209,9 @@ target("infiniop")
if
has_config
(
"nv-gpu"
)
then
add_deps
(
"infiniop-cuda"
)
end
if
has_config
(
"iluvatar-gpu"
)
then
add_deps
(
"infiniop-iluvatar"
)
end
if
has_config
(
"sugon-dcu"
)
then
local
builddir
=
string.format
(
"build/%s/%s/%s"
,
...
...
@@ -248,7 +267,7 @@ target("infiniccl")
if
has_config
(
"metax-gpu"
)
then
add_deps
(
"infiniccl-metax"
)
end
set_languages
(
"cxx17"
)
add_files
(
"src/infiniccl/*.cc"
)
...
...
xmake/iluvatar.lua
0 → 100644
View file @
8366330c
toolchain
(
"iluvatar.toolchain"
)
set_toolset
(
"cc"
,
"clang"
)
set_toolset
(
"cxx"
,
"clang++"
)
set_toolset
(
"cu"
,
"clang++"
)
set_toolset
(
"culd"
,
"clang++"
)
set_toolset
(
"cu-ccbin"
,
"$(env CXX)"
,
"$(env CC)"
)
toolchain_end
()
rule
(
"iluvatar.env"
)
add_deps
(
"cuda.env"
,
{
order
=
true
})
after_load
(
function
(
target
)
local
old
=
target
:
get
(
"syslinks"
)
local
new
=
{}
for
_
,
link
in
ipairs
(
old
)
do
if
link
~=
"cudadevrt"
then
table.insert
(
new
,
link
)
end
end
if
#
old
>
#
new
then
target
:
set
(
"syslinks"
,
new
)
local
log
=
"cudadevrt removed, syslinks = { "
for
_
,
link
in
ipairs
(
new
)
do
log
=
log
..
link
..
", "
end
log
=
log
:
sub
(
0
,
-
3
)
..
" }"
print
(
log
)
end
end
)
rule_end
()
target
(
"infiniop-iluvatar"
)
set_kind
(
"static"
)
add_deps
(
"infini-utils"
)
on_install
(
function
(
target
)
end
)
set_toolchains
(
"iluvatar.toolchain"
)
add_rules
(
"iluvatar.env"
)
set_values
(
"cuda.rdc"
,
false
)
add_links
(
"cudart"
,
"cublas"
,
"cudnn"
)
set_warnings
(
"all"
,
"error"
)
add_cuflags
(
"-fPIC"
,
"-x"
,
"ivcore"
,
"-std=c++17"
,
{
force
=
true
})
add_cuflags
(
"-fPIC"
)
add_culdflags
(
"-fPIC"
)
add_cxflags
(
"-fPIC"
)
-- set_languages("cxx17") 天数似乎不能用这个配置
add_files
(
"../src/infiniop/devices/cuda/*.cu"
,
"../src/infiniop/ops/*/cuda/*.cu"
)
target_end
()
target
(
"infinirt-iluvatar"
)
set_kind
(
"static"
)
add_deps
(
"infini-utils"
)
on_install
(
function
(
target
)
end
)
set_toolchains
(
"iluvatar.toolchain"
)
add_rules
(
"iluvatar.env"
)
set_values
(
"cuda.rdc"
,
false
)
add_links
(
"cudart"
)
set_warnings
(
"all"
,
"error"
)
add_cuflags
(
"-fPIC"
,
"-x"
,
"ivcore"
,
"-std=c++17"
,
{
force
=
true
})
add_cuflags
(
"-fPIC"
)
add_culdflags
(
"-fPIC"
)
add_cxflags
(
"-fPIC"
)
-- set_languages("cxx17") 天数似乎不能用这个配置
add_files
(
"../src/infinirt/cuda/*.cu"
)
target_end
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment