Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
abf1e021
Commit
abf1e021
authored
Jul 10, 2025
by
YdrMaster
Browse files
issue/291/refactor: 改造 rms_norm、rope、swiglu
Signed-off-by:
YdrMaster
<
ydrml@hotmail.com
>
parent
f06eb359
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
68 additions
and
72 deletions
+68
-72
src/infiniop/ops/rms_norm/cuda/kernel.cuh
src/infiniop/ops/rms_norm/cuda/kernel.cuh
+0
-6
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
+9
-3
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh
+1
-1
src/infiniop/ops/rms_norm/operator.cc
src/infiniop/ops/rms_norm/operator.cc
+29
-29
src/infiniop/ops/rope/cuda/kernel.cuh
src/infiniop/ops/rope/cuda/kernel.cuh
+0
-2
src/infiniop/ops/rope/nvidia/rope_nvidia.cu
src/infiniop/ops/rope/nvidia/rope_nvidia.cu
+6
-3
src/infiniop/ops/rope/nvidia/rope_nvidia.cuh
src/infiniop/ops/rope/nvidia/rope_nvidia.cuh
+1
-1
src/infiniop/ops/rope/operator.cc
src/infiniop/ops/rope/operator.cc
+5
-5
src/infiniop/ops/swiglu/cuda/kernel.cuh
src/infiniop/ops/swiglu/cuda/kernel.cuh
+0
-0
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
+7
-7
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh
src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh
+1
-1
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+9
-14
No files found.
src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh → src/infiniop/ops/rms_norm/cuda/kernel.cuh
View file @
abf1e021
#ifndef __RMS_NORM_CUDA_KERNEL_H__
#define __RMS_NORM_CUDA_KERNEL_H__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
,
typename
Tweight
,
typename
Tcompute
>
INFINIOP_CUDA_KERNEL
rmsnormBlock
(
Tdata
*
__restrict__
y
,
...
...
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu → src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
View file @
abf1e021
#include "../../../devices/cuda/cuda_common.cuh"
#include "rms_norm_cuda.cuh"
#include "rms_norm_kernel.cuh"
#include "rms_norm_nvidia.cuh"
namespace
op
::
rms_norm
::
cuda
{
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
namespace
op
::
rms_norm
::
nvidia
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cuda
::
Handle
::
Internal
>
internal
;
...
...
src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh → src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh
View file @
abf1e021
...
...
@@ -3,6 +3,6 @@
#include "../rms_norm.h"
DESCRIPTOR
(
cud
a
)
DESCRIPTOR
(
nvidi
a
)
#endif
src/infiniop/ops/rms_norm/operator.cc
View file @
abf1e021
...
...
@@ -6,7 +6,7 @@
#include "cpu/rms_norm_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#include "
cud
a/rms_norm_
cud
a.cuh"
#include "
nvidi
a/rms_norm_
nvidi
a.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rms_norm_aclnn.h"
...
...
@@ -37,17 +37,17 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
y_desc, \
x_desc, \
w_desc, \
epsilon)
;
epsilon)
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
)
CREATE
(
INFINI_DEVICE_CPU
,
cpu
)
;
#endif
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
;
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -55,13 +55,13 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
}
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
;
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
maca
)
CREATE
(
INFINI_DEVICE_METAX
,
maca
)
;
#endif
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
musa
)
CREATE
(
INFINI_DEVICE_MOORE
,
musa
)
;
#endif
}
...
...
@@ -75,17 +75,17 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
GET
(
INFINI_DEVICE_CPU
,
cpu
)
;
#endif
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
GET
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
;
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -93,13 +93,13 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
}
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
;
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
maca
)
GET
(
INFINI_DEVICE_METAX
,
maca
)
;
#endif
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
musa
)
GET
(
INFINI_DEVICE_MOORE
,
musa
)
;
#endif
}
...
...
@@ -114,17 +114,17 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, y, x, w, stream)
;
workspace, workspace_size, y, x, w, stream)
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
)
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
)
;
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
;
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -132,13 +132,13 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
}
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
;
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
)
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
)
;
#endif
#ifdef ENABLE_MOORE_API
CALCULATE
(
INFINI_DEVICE_MOORE
,
musa
)
CALCULATE
(
INFINI_DEVICE_MOORE
,
musa
)
;
#endif
}
...
...
@@ -152,17 +152,17 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
)
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
)
;
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
DESTROY
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
;
#endif
#ifdef ENABLE_KUNLUN_API
DESTROY
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
DESTROY
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -170,13 +170,13 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
}
#endif
#ifdef ENABLE_ASCEND_API
DESTROY
(
INFINI_DEVICE_ASCEND
,
ascend
)
DESTROY
(
INFINI_DEVICE_ASCEND
,
ascend
)
;
#endif
#ifdef ENABLE_METAX_API
DESTROY
(
INFINI_DEVICE_METAX
,
maca
)
DESTROY
(
INFINI_DEVICE_METAX
,
maca
)
;
#endif
#ifdef ENABLE_MOORE_API
DESTROY
(
INFINI_DEVICE_MOORE
,
musa
)
DESTROY
(
INFINI_DEVICE_MOORE
,
musa
)
;
#endif
}
...
...
src/infiniop/ops/rope/cuda/rope_cuda_kernel.cuh → src/infiniop/ops/rope/cuda/kernel.cuh
View file @
abf1e021
#ifndef __INFINIOP_ROPE_CUDA_KERNEL_CUH__
#define __INFINIOP_ROPE_CUDA_KERNEL_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
template
<
typename
Tdata
,
typename
Tindex
,
typename
Tangle
>
INFINIOP_CUDA_KERNEL
ropeThreadPerItem
(
Tdata
*
y_
,
...
...
src/infiniop/ops/rope/cuda/rope_cuda.cu → src/infiniop/ops/rope/nvidia/rope_nvidia.cu
View file @
abf1e021
#include "../../../devices/cuda/cuda_common.cuh"
#include "rope_cuda.cuh"
#include "rope_cuda_kernel.cuh"
#include "rope_nvidia.cuh"
namespace
op
::
rope
::
cuda
{
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../cuda/kernel.cuh"
namespace
op
::
rope
::
nvidia
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cuda
::
Handle
::
Internal
>
internal
;
...
...
src/infiniop/ops/rope/cuda/rope_cuda.cuh → src/infiniop/ops/rope/nvidia/rope_nvidia.cuh
View file @
abf1e021
...
...
@@ -3,6 +3,6 @@
#include "../rope.h"
DESCRIPTOR
(
cud
a
)
DESCRIPTOR
(
nvidi
a
)
#endif // __INFINIOP_ROPE_CUDA_H__
src/infiniop/ops/rope/operator.cc
View file @
abf1e021
...
...
@@ -6,7 +6,7 @@
#include "cpu/rope_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#include "
cud
a/rope_
cud
a.cuh"
#include "
nvidi
a/rope_
nvidi
a.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rope_ascend.h"
...
...
@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
maca
);
...
...
@@ -81,7 +81,7 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
GET
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
maca
);
...
...
@@ -132,7 +132,7 @@ __C infiniStatus_t infiniopRoPE(
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
);
...
...
@@ -178,7 +178,7 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
maca
);
...
...
src/infiniop/ops/swiglu/cuda/swiglu_cuda_internal.cuh → src/infiniop/ops/swiglu/cuda/kernel.cuh
View file @
abf1e021
File moved
src/infiniop/ops/swiglu/cuda/swiglu_cuda.cu → src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
View file @
abf1e021
#include "swiglu_
cud
a.cuh"
#include "
swiglu_cuda_int
ern
a
l.cuh"
#include "swiglu_
nvidi
a.cuh"
#include "
../cuda/k
ern
e
l.cuh"
namespace
op
::
swiglu
::
cud
a
{
namespace
op
::
swiglu
::
nvidi
a
{
Descriptor
::~
Descriptor
()
=
default
;
...
...
@@ -42,13 +42,13 @@ infiniStatus_t Descriptor::calculate(
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
return
_device_info
->
calculate
<
256
,
SwiGLUOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
SwiGLUOp
,
__nv_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
__nv_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F32
:
return
_device_info
->
calculate
<
256
,
SwiGLUOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
SwiGLUOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SwiGLUOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
...
...
src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh → src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh
View file @
abf1e021
...
...
@@ -3,6 +3,6 @@
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR
(
swiglu
,
cud
a
,
cuda
)
ELEMENTWISE_DESCRIPTOR
(
swiglu
,
nvidi
a
,
cuda
)
#endif // __SWIGLU_CUDA_API_H__
src/infiniop/ops/swiglu/operator.cc
View file @
abf1e021
...
...
@@ -6,7 +6,7 @@
#include "cpu/swiglu_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#include "
cud
a/swiglu_
cud
a.cuh"
#include "
nvidi
a/swiglu_
nvidi
a.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
...
...
@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
@@ -83,17 +83,17 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
GET
(
INFINI_DEVICE_CPU
,
cpu
)
;
#endif
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cud
a
)
GET
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
)
;
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
maca
);
...
...
@@ -104,12 +104,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
}
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaGetSwiGLUWorkspaceSize
((
SwiGLUMacaDescriptor_t
)
desc
,
size
);
}
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
...
...
@@ -143,7 +138,7 @@ __C infiniStatus_t infiniopSwiGLU(
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
@@ -189,7 +184,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cud
a
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidi
a
);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment