Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
e4605f7c
Unverified
Commit
e4605f7c
authored
Jul 11, 2025
by
PanZezhong1725
Committed by
GitHub
Jul 11, 2025
Browse files
Merge pull request #293 from YdrMaster/distinct-cuda
issue291 合并 cuda 代码
parents
5025ebed
eac2b0ca
Changes
68
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
37 additions
and
99 deletions
+37
-99
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
+11
-8
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh
+1
-1
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+14
-19
src/infiniop/reduce/cuda/reduce.cuh
src/infiniop/reduce/cuda/reduce.cuh
+5
-2
src/infiniop/reduce/maca/reduce.h
src/infiniop/reduce/maca/reduce.h
+0
-63
xmake.lua
xmake.lua
+2
-2
xmake/cuda.lua
xmake/cuda.lua
+1
-1
xmake/metax.lua
xmake/metax.lua
+3
-3
No files found.
src/infiniop/ops/swiglu/
cud
a/swiglu_
cud
a.cu
→
src/infiniop/ops/swiglu/
nvidi
a/swiglu_
nvidi
a.cu
View file @
e4605f7c
#include "swiglu_cuda.cuh"
#include "swiglu_nvidia.cuh"
#include "swiglu_cuda_internal.cuh"
namespace
op
::
swiglu
::
cuda
{
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh"
namespace
op
::
swiglu
::
nvidia
{
Descriptor
::~
Descriptor
()
=
default
;
Descriptor
::~
Descriptor
()
=
default
;
...
@@ -42,17 +45,17 @@ infiniStatus_t Descriptor::calculate(
...
@@ -42,17 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch
(
_dtype
)
{
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
case
INFINI_DTYPE_F16
:
return
_device_info
->
calculate
<
256
,
SwiGLUOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
SwiGLUOp
,
__nv
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
cuda
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F32
:
case
INFINI_DTYPE_F32
:
return
_device_info
->
calculate
<
256
,
SwiGLUOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
SwiGLUOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
}
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
}
// namespace op::swiglu::
cud
a
}
// namespace op::swiglu::
nvidi
a
src/infiniop/ops/swiglu/
cud
a/swiglu_
cud
a.cuh
→
src/infiniop/ops/swiglu/
nvidi
a/swiglu_
nvidi
a.cuh
View file @
e4605f7c
...
@@ -3,6 +3,6 @@
...
@@ -3,6 +3,6 @@
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR
(
swiglu
,
cuda
)
ELEMENTWISE_DESCRIPTOR
(
swiglu
,
nvidia
,
cuda
)
#endif // __SWIGLU_CUDA_API_H__
#endif // __SWIGLU_CUDA_API_H__
src/infiniop/ops/swiglu/operator.cc
View file @
e4605f7c
...
@@ -6,13 +6,13 @@
...
@@ -6,13 +6,13 @@
#include "cpu/swiglu_cpu.h"
#include "cpu/swiglu_cpu.h"
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
#include "
cud
a/swiglu_
cud
a.cuh"
#include "
nvidi
a/swiglu_
nvidi
a.cuh"
#endif
#endif
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#include "kunlun/swiglu_kunlun.h"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
#include "m
aca
/swiglu_m
aca
.h"
#include "m
etax
/swiglu_m
etax
.h"
#endif
#endif
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
#include "ascend/swiglu_ascend.h"
...
@@ -40,13 +40,13 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
...
@@ -40,13 +40,13 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#endif
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CREATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
@@ -83,20 +83,20 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
...
@@ -83,20 +83,20 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#define GET(CASE, NAMESPACE) \
#define GET(CASE, NAMESPACE) \
case CASE: \
case CASE: \
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
GET
(
INFINI_DEVICE_CPU
,
cpu
)
;
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
GET
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
;
#endif
#endif
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
m
aca
);
GET
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
@@ -104,12 +104,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
...
@@ -104,12 +104,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_API
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaGetSwiGLUWorkspaceSize
((
SwiGLUMacaDescriptor_t
)
desc
,
size
);
}
#endif
#endif
#ifdef ENABLE_MTHREADS_GPU
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
case
DevMthreadsGpu
:
{
...
@@ -143,13 +138,13 @@ __C infiniStatus_t infiniopSwiGLU(
...
@@ -143,13 +138,13 @@ __C infiniStatus_t infiniopSwiGLU(
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#endif
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CALCULATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
@@ -189,13 +184,13 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
...
@@ -189,13 +184,13 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#endif
#ifdef ENABLE_KUNLUN_API
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
m
aca
);
DELETE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
...
...
src/infiniop/reduce/cuda/reduce.cuh
View file @
e4605f7c
#ifndef __INFINIOP_REDUCE_CUDA_H__
#ifndef __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__
#define __INFINIOP_REDUCE_CUDA_H__
#include <cub/block/block_reduce.cuh>
/*
/*
* Device functions for reduction operations on CUDA.
* Device functions for reduction operations on CUDA.
*
*
* Note: Only local result on thread 0 is guaranteed to be correct.
* Note: Only local result on thread 0 is guaranteed to be correct.
* A manual broadcast is needed for other threads.
* A manual broadcast is needed for other threads.
*
* Important Note: This is a device-independent header file containing reduce kernels
* for all cuda-supporting platforms. Include device-specific headers
* (such as <cub/block/block_reduce.cuh> for nvidia) in your source file
* and then include this file for proper usage.
*/
*/
namespace
op
::
common_cuda
::
reduce_op
{
namespace
op
::
common_cuda
::
reduce_op
{
...
...
src/infiniop/reduce/maca/reduce.h
deleted
100644 → 0
View file @
5025ebed
#ifndef __INFINIOP_REDUCE_MACA_H__
#define __INFINIOP_REDUCE_MACA_H__
#include <hccub/block/block_reduce.cuh>
/*
* Device functions for reduction operations on MACA.
*
* Note: Only local result on thread 0 is guaranteed to be correct.
* A manual broadcast is needed for other threads.
*/
namespace
op
::
common_maca
::
reduce_op
{
// Block-wide sum of squares over the `count` contiguous elements at `data_ptr`.
//
// Template parameters:
//   BLOCK_SIZE - loop stride and the block size handed to cub::BlockReduce
//                (presumably equal to blockDim.x -- confirm at the call site).
//   Tdata      - element type stored in memory.
//   Tcompute   - type each element is converted to before squaring/accumulating.
//
// Returns the value produced by cub::BlockReduce::Sum. Per the note at the top
// of this header, only thread 0's result is guaranteed to be correct.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
    using Reducer = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename Reducer::TempStorage scratch;

    // Per-thread partial: start at threadIdx.x and advance by BLOCK_SIZE.
    Tcompute partial = 0;
    for (size_t idx = threadIdx.x; idx < count; idx += BLOCK_SIZE) {
        const Tcompute value = Tcompute(data_ptr[idx]);
        partial += value * value;
    }

    // Combine the per-thread partials across the block.
    return Reducer(scratch).Sum(partial);
}
// Block-wide sum over the `count` contiguous elements at `data_ptr`.
//
// Template parameters:
//   BLOCK_SIZE - loop stride and the block size handed to cub::BlockReduce
//                (presumably equal to blockDim.x -- confirm at the call site).
//   Tdata      - element type stored in memory.
//   Tcompute   - type each element is converted to before accumulating.
//
// Returns the value produced by cub::BlockReduce::Sum. Per the note at the top
// of this header, only thread 0's result is guaranteed to be correct.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, size_t count) {
    using Reducer = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename Reducer::TempStorage scratch;

    // Per-thread partial: start at threadIdx.x and advance by BLOCK_SIZE.
    Tcompute partial = 0;
    for (size_t idx = threadIdx.x; idx < count; idx += BLOCK_SIZE) {
        partial += Tcompute(data_ptr[idx]);
    }

    // Combine the per-thread partials across the block.
    return Reducer(scratch).Sum(partial);
}
// Block-wide maximum over the `count` contiguous elements at `data_ptr`.
//
// Template parameters:
//   BLOCK_SIZE - loop stride and the block size handed to cub::BlockReduce
//                (presumably equal to blockDim.x -- confirm at the call site).
//   Tdata      - element type; also the type of the result.
//
// Returns the value produced by cub::BlockReduce::Reduce with cub::Max. Per the
// note at the top of this header, only thread 0's result is guaranteed to be
// correct.
//
// NOTE(review): data_ptr[0] is read unconditionally to seed every thread, so
// callers presumably must pass count >= 1 -- confirm.
template <unsigned int BLOCK_SIZE, typename Tdata>
__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
    using Reducer = cub::BlockReduce<Tdata, BLOCK_SIZE>;
    __shared__ typename Reducer::TempStorage scratch;

    // Seed with the first element so threads that visit no index still carry
    // a value taken from the input.
    Tdata running_max = data_ptr[0];
    for (size_t idx = threadIdx.x; idx < count; idx += BLOCK_SIZE) {
        running_max = cub::Max()(running_max, data_ptr[idx]);
    }

    // Reduce across the block, treating all BLOCK_SIZE threads as valid.
    return Reducer(scratch).Reduce(running_max, cub::Max(), BLOCK_SIZE);
}
}
// namespace op::common_maca::reduce_op
#endif
xmake.lua
View file @
e4605f7c
...
@@ -108,7 +108,7 @@ option_end()
...
@@ -108,7 +108,7 @@ option_end()
if
has_config
(
"metax-gpu"
)
then
if
has_config
(
"metax-gpu"
)
then
add_defines
(
"ENABLE_METAX_API"
)
add_defines
(
"ENABLE_METAX_API"
)
includes
(
"xmake/m
aca
.lua"
)
includes
(
"xmake/m
etax
.lua"
)
end
end
-- 摩尔线程
-- 摩尔线程
...
@@ -174,7 +174,7 @@ target("infini-utils")
...
@@ -174,7 +174,7 @@ target("infini-utils")
add_cxflags
(
"-fPIC"
,
"-Wno-unknown-pragmas"
)
add_cxflags
(
"-fPIC"
,
"-Wno-unknown-pragmas"
)
if
has_config
(
"omp"
)
then
if
has_config
(
"omp"
)
then
add_cxflags
(
"-fopenmp"
)
add_cxflags
(
"-fopenmp"
)
add_ldflags
(
"-fopenmp"
)
add_ldflags
(
"-fopenmp"
,
{
force
=
true
}
)
end
end
end
end
...
...
xmake/cuda.lua
View file @
e4605f7c
...
@@ -46,7 +46,7 @@ target("infiniop-cuda")
...
@@ -46,7 +46,7 @@ target("infiniop-cuda")
add_cuflags
(
"-Xcompiler=-Wno-error=deprecated-declarations"
)
add_cuflags
(
"-Xcompiler=-Wno-error=deprecated-declarations"
)
set_languages
(
"cxx17"
)
set_languages
(
"cxx17"
)
add_files
(
"../src/infiniop/devices/cuda/*.cu"
,
"../src/infiniop/ops/*/cuda/*.cu"
,
"../build/ninetoothed/*.c"
)
add_files
(
"../src/infiniop/devices/cuda/*.cu"
,
"../src/infiniop/ops/*/cuda/*.cu"
,
"../src/infiniop/ops/*/nvidia/*.cu"
,
"../build/ninetoothed/*.c"
)
target_end
()
target_end
()
target
(
"infinirt-cuda"
)
target
(
"infinirt-cuda"
)
...
...
xmake/m
aca
.lua
→
xmake/m
etax
.lua
View file @
e4605f7c
...
@@ -34,8 +34,8 @@ target("infiniop-metax")
...
@@ -34,8 +34,8 @@ target("infiniop-metax")
set_languages
(
"cxx17"
)
set_languages
(
"cxx17"
)
set_warnings
(
"all"
,
"error"
)
set_warnings
(
"all"
,
"error"
)
add_cxflags
(
"-lstdc++"
,
"-fPIC"
,
"-Wno-defaulted-function-deleted"
,
"-Wno-strict-aliasing"
)
add_cxflags
(
"-lstdc++"
,
"-fPIC"
,
"-Wno-defaulted-function-deleted"
,
"-Wno-strict-aliasing"
)
add_files
(
"../src/infiniop/devices/maca/*.cc"
,
"../src/infiniop/ops/*/m
aca
/*.cc"
)
add_files
(
"../src/infiniop/devices/maca/*.cc"
,
"../src/infiniop/ops/*/m
etax
/*.cc"
)
add_files
(
"../src/infiniop/ops/*/m
aca
/*.maca"
,
{
rule
=
"maca"
})
add_files
(
"../src/infiniop/ops/*/m
etax
/*.maca"
,
{
rule
=
"maca"
})
target_end
()
target_end
()
target
(
"infinirt-metax"
)
target
(
"infinirt-metax"
)
...
@@ -61,5 +61,5 @@ target("infiniccl-metax")
...
@@ -61,5 +61,5 @@ target("infiniccl-metax")
add_files
(
"../src/infiniccl/maca/*.cc"
)
add_files
(
"../src/infiniccl/maca/*.cc"
)
end
end
set_languages
(
"cxx17"
)
set_languages
(
"cxx17"
)
target_end
()
target_end
()
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment