Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
0166515c
Unverified
Commit
0166515c
authored
Aug 07, 2025
by
PanZezhong1725
Committed by
GitHub
Aug 07, 2025
Browse files
Merge branch 'main' into issue/300
parents
f0300ff3
a23c4d13
Changes
175
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
519 additions
and
184 deletions
+519
-184
src/infiniop/ops/relu/nvidia/relu_nvidia.cu
src/infiniop/ops/relu/nvidia/relu_nvidia.cu
+80
-0
src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
+12
-0
src/infiniop/ops/relu/operator.cc
src/infiniop/ops/relu/operator.cc
+170
-0
src/infiniop/ops/rms_norm/cuda/kernel.cuh
src/infiniop/ops/rms_norm/cuda/kernel.cuh
+2
-5
src/infiniop/ops/rms_norm/metax/rms_norm_metax.cuh
src/infiniop/ops/rms_norm/metax/rms_norm_metax.cuh
+8
-0
src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
+43
-21
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
+33
-15
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh
src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh
+1
-1
src/infiniop/ops/rms_norm/operator.cc
src/infiniop/ops/rms_norm/operator.cc
+47
-35
src/infiniop/ops/rope/cuda/kernel.cuh
src/infiniop/ops/rope/cuda/kernel.cuh
+4
-6
src/infiniop/ops/rope/maca/rope_maca.h
src/infiniop/ops/rope/maca/rope_maca.h
+0
-8
src/infiniop/ops/rope/maca/rope_maca_kernel.h
src/infiniop/ops/rope/maca/rope_maca_kernel.h
+0
-42
src/infiniop/ops/rope/metax/rope_metax.h
src/infiniop/ops/rope/metax/rope_metax.h
+8
-0
src/infiniop/ops/rope/metax/rope_metax.maca
src/infiniop/ops/rope/metax/rope_metax.maca
+34
-9
src/infiniop/ops/rope/nvidia/rope_nvidia.cu
src/infiniop/ops/rope/nvidia/rope_nvidia.cu
+33
-10
src/infiniop/ops/rope/nvidia/rope_nvidia.cuh
src/infiniop/ops/rope/nvidia/rope_nvidia.cuh
+1
-1
src/infiniop/ops/rope/operator.cc
src/infiniop/ops/rope/operator.cc
+30
-25
src/infiniop/ops/sub/cpu/sub_cpu.cc
src/infiniop/ops/sub/cpu/sub_cpu.cc
+3
-1
src/infiniop/ops/sub/cuda/kernel.cuh
src/infiniop/ops/sub/cuda/kernel.cuh
+2
-5
src/infiniop/ops/sub/metax/sub_metax.h
src/infiniop/ops/sub/metax/sub_metax.h
+8
-0
No files found.
src/infiniop/ops/relu/nvidia/relu_nvidia.cu
0 → 100644
View file @
0166515c
#ifdef ENABLE_NINETOOTHED
#include "../../../../../build/ninetoothed/relu.h"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "relu_nvidia.cuh"
namespace
op
::
relu
::
nvidia
{
Descriptor
::~
Descriptor
()
=
default
;
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle_
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
out_desc
,
std
::
vector
<
infiniopTensorDescriptor_t
>
input_desc_vec
)
{
auto
handle
=
reinterpret_cast
<
device
::
nvidia
::
Handle
*>
(
handle_
);
auto
dtype
=
out_desc
->
dtype
();
const
auto
&
x_desc
=
input_desc_vec
.
at
(
0
);
const
auto
&
y_shape
=
out_desc
->
shape
();
const
auto
&
x_shape
=
x_desc
->
shape
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
,
INFINI_DTYPE_BF16
);
CHECK_SAME_SHAPE
(
y_shape
,
x_shape
);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR
(
handle
,
dtype
,
out_desc
,
input_desc_vec
)
return
INFINI_STATUS_SUCCESS
;
}
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
void
*
output
,
std
::
vector
<
const
void
*>
inputs
,
void
*
stream
)
const
{
if
(
workspace_size
<
_workspace_size
)
{
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
}
const
auto
&
ndim
{
_info
.
getNdim
()};
const
auto
&
x_shape_
{
_info
.
getInputShape
(
0
)};
const
auto
&
x_strides_
{
_info
.
getInputStrides
(
0
)};
std
::
vector
<
uint64_t
>
x_shape_vec
{
x_shape_
,
x_shape_
+
ndim
};
std
::
vector
<
int64_t
>
x_strides_vec
{
x_strides_
,
x_strides_
+
ndim
};
auto
x_data
{
const_cast
<
void
*>
(
inputs
[
0
])};
auto
x_shape
{
x_shape_vec
.
data
()};
auto
x_strides
{
x_strides_vec
.
data
()};
const
NineToothedTensor
x
{
x_data
,
x_shape
,
x_strides
};
const
auto
&
y_shape_
{
_info
.
getOutputShape
()};
const
auto
&
y_strides_
{
_info
.
getOutputStrides
()};
std
::
vector
<
uint64_t
>
y_shape_vec
{
y_shape_
,
y_shape_
+
ndim
};
std
::
vector
<
int64_t
>
y_strides_vec
{
y_strides_
,
y_strides_
+
ndim
};
auto
y_data
{
output
};
auto
y_shape
{
y_shape_vec
.
data
()};
auto
y_strides
{
y_strides_vec
.
data
()};
const
NineToothedTensor
y
{
y_data
,
y_shape
,
y_strides
};
constexpr
auto
block_size
{
1024
};
switch
(
_dtype
)
{
case
INFINI_DTYPE_F16
:
case
INFINI_DTYPE_F32
:
case
INFINI_DTYPE_F64
:
case
INFINI_DTYPE_BF16
:
if
(
launch_relu
(
stream
,
x
,
y
,
ndim
,
_dtype
,
block_size
))
{
return
INFINI_STATUS_INTERNAL_ERROR
;
}
return
INFINI_STATUS_SUCCESS
;
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::relu::nvidia
#endif
src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
0 → 100644
View file @
0166515c
#ifndef __RELU_NVIDIA_API_H__
#define __RELU_NVIDIA_API_H__
#ifdef ENABLE_NINETOOTHED
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR
(
relu
,
nvidia
)
#endif
#endif // __RELU_NVIDIA_API_H__
src/infiniop/ops/relu/operator.cc
0 → 100644
View file @
0166515c
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/relu.h"
#ifdef ENABLE_CPU_API
#include "cpu/relu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#ifdef ENABLE_NINETOOTHED
#include "nvidia/relu_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
#include "metax/relu_metax.h"
#endif
#endif
__C
infiniStatus_t
infiniopCreateReluDescriptor
(
infiniopHandle_t
handle
,
infiniopReluDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
y_desc
,
infiniopTensorDescriptor_t
x_desc
)
{
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::relu::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::relu::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
{x_desc})
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef CREATE
}
__C
infiniStatus_t
infiniopGetReluWorkspaceSize
(
infiniopReluDescriptor_t
desc
,
size_t
*
size
)
{
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::relu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
)
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef GET
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
__C
infiniStatus_t
infiniopRelu
(
infiniopReluDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
void
*
stream
)
{
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::relu::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, {x}, stream)
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef CALCULATE
}
__C
infiniStatus_t
infiniopDestroyReluDescriptor
(
infiniopReluDescriptor_t
desc
)
{
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::relu::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef DELETE
}
src/infiniop/ops/rms_norm/cuda/
rms_norm_
kernel.cuh
→
src/infiniop/ops/rms_norm/cuda/kernel.cuh
View file @
0166515c
#ifndef __RMS_NORM_CUDA_KERNEL_H__
#define __RMS_NORM_CUDA_KERNEL_H__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
,
typename
Tweight
,
typename
Tcompute
>
INFINIOP_CUDA_KERNEL
rmsnormBlock
(
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tcompute
,
typename
Tdata
,
typename
Tweight
>
__device__
void
rmsnormBlock
(
Tdata
*
__restrict__
y
,
ptrdiff_t
stride_y
,
const
Tdata
*
__restrict__
x
,
...
...
src/infiniop/ops/rms_norm/m
aca
/rms_norm_m
aca
.cuh
→
src/infiniop/ops/rms_norm/m
etax
/rms_norm_m
etax
.cuh
View file @
0166515c
#ifndef __RMS_NORM_M
ACA
_CUH__
#define __RMS_NORM_M
ACA
_CUH__
#ifndef __RMS_NORM_M
ETAX
_CUH__
#define __RMS_NORM_M
ETAX
_CUH__
#include "../rms_norm.h"
DESCRIPTOR
(
m
aca
)
DESCRIPTOR
(
m
etax
)
#endif
src/infiniop/ops/rms_norm/m
aca
/rms_norm_m
aca
.maca
→
src/infiniop/ops/rms_norm/m
etax
/rms_norm_m
etax
.maca
View file @
0166515c
#include "../../../devices/maca/common_maca.h"
#include "../cuda/rms_norm_kernel.cuh"
#include "rms_norm_maca.cuh"
#include "../../../devices/metax/metax_common.h"
#include "rms_norm_metax.cuh"
namespace op::rms_norm::maca {
#include "../../../devices/metax/metax_kernel_common.h"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
INFINIOP_METAX_KERNEL rmsnormKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y,
const Tdata *__restrict__ x,
ptrdiff_t stride_x,
const Tweight *__restrict__ w,
size_t dim,
float epsilon) {
rmsnormBlock<BLOCK_SIZE, Tcompute>(y, stride_y, x, stride_x, w, dim, epsilon);
}
namespace op::rms_norm::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::m
aca
::Handle::Internal> internal;
std::shared_ptr<device::m
etax
::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
...
...
@@ -29,7 +47,7 @@ infiniStatus_t Descriptor::create(
}
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::m
aca
::Handle *>(handle)->internal()},
new Opaque{reinterpret_cast<device::m
etax
::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
...
...
@@ -44,20 +62,24 @@ infiniStatus_t launchKernel(
const void *x, ptrdiff_t stride_x,
const void *w, infiniDtype_t wtype,
float epsilon,
hcStream_t
maca_
stream) {
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
rmsnorm
Block
<BLOCK_SIZE, Tdata, Tweight
, Tcompute
><<<batch_size, BLOCK_SIZE, 0,
maca_
stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y, \
reinterpret_cast<const Tdata *>(x), \
stride_x, \
reinterpret_cast<const Tweight *>(w), \
dim, \
hcStream_t stream) {
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)
\
rmsnorm
Kernel
<BLOCK_SIZE,
Tcompute,
Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, stream>>>( \
reinterpret_cast<Tdata *>(y),
\
stride_y,
\
reinterpret_cast<const Tdata *>(x),
\
stride_x,
\
reinterpret_cast<const Tweight *>(w),
\
dim,
\
epsilon)
if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, half, float);
} else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__hpcc_bfloat16, __hpcc_bfloat16, float);
} else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(__hpcc_bfloat16, float, float);
} else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(half, float, float);
} else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
...
...
@@ -74,7 +96,7 @@ infiniStatus_t launchKernel(
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x, const void *w,
void *stream) const {
void *stream
_
) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
...
...
@@ -84,14 +106,14 @@ infiniStatus_t Descriptor::calculate(
auto stride_y = _info.y_strides[0];
auto dim = _info.dim();
uint32_t batch_size = static_cast<uint32_t>(_info.shape[0]);
auto
maca_
stream = reinterpret_cast<hcStream_t>(stream);
auto stream = reinterpret_cast<hcStream_t>(stream
_
);
// launch kernel with different block sizes
if (_opaque->internal->maxThreadsPerBlock() ==
CUDA
_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<
CUDA
_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon,
maca_
stream));
if (_opaque->internal->maxThreadsPerBlock() ==
METAX
_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<
METAX
_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::rms_norm::m
aca
} // namespace op::rms_norm::m
etax
src/infiniop/ops/rms_norm/
cud
a/rms_norm_
cud
a.cu
→
src/infiniop/ops/rms_norm/
nvidi
a/rms_norm_
nvidi
a.cu
View file @
0166515c
#include "../../../devices/cuda/cuda_common.cuh"
#include "rms_norm_cuda.cuh"
#include "rms_norm_kernel.cuh"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "rms_norm_nvidia.cuh"
namespace
op
::
rms_norm
::
cuda
{
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tcompute
,
typename
Tdata
,
typename
Tweight
>
INFINIOP_CUDA_KERNEL
rmsnormKernel
(
Tdata
*
__restrict__
y
,
ptrdiff_t
stride_y
,
const
Tdata
*
__restrict__
x
,
ptrdiff_t
stride_x
,
const
Tweight
*
__restrict__
w
,
size_t
dim
,
float
epsilon
)
{
rmsnormBlock
<
BLOCK_SIZE
,
Tcompute
>
(
y
,
stride_y
,
x
,
stride_x
,
w
,
dim
,
epsilon
);
}
namespace
op
::
rms_norm
::
nvidia
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
...
...
@@ -29,7 +47,7 @@ infiniStatus_t Descriptor::create(
}
*
desc_ptr
=
new
Descriptor
(
new
Opaque
{
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle
)
->
internal
()},
new
Opaque
{
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle
)
->
internal
()},
std
::
move
(
info
),
0
,
handle
->
device
,
handle
->
device_id
);
...
...
@@ -46,14 +64,14 @@ infiniStatus_t launchKernel(
float
epsilon
,
cudaStream_t
cuda_stream
)
{
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \
rmsnorm
Block
<BLOCK_SIZE, Tdata, Tweight
, Tcompute
><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y, \
reinterpret_cast<const Tdata *>(x), \
stride_x, \
reinterpret_cast<const Tweight *>(w), \
dim, \
#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)
\
rmsnorm
Kernel
<BLOCK_SIZE,
Tcompute,
Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y),
\
stride_y,
\
reinterpret_cast<const Tdata *>(x),
\
stride_x,
\
reinterpret_cast<const Tweight *>(w),
\
dim,
\
epsilon)
if
(
atype
==
INFINI_DTYPE_F16
&&
wtype
==
INFINI_DTYPE_F16
)
{
...
...
@@ -102,4 +120,4 @@ infiniStatus_t Descriptor::calculate(
}
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::rms_norm::
cud
a
}
// namespace op::rms_norm::
nvidi
a
src/infiniop/ops/rms_norm/
cud
a/rms_norm_
cud
a.cuh
→
src/infiniop/ops/rms_norm/
nvidi
a/rms_norm_
nvidi
a.cuh
View file @
0166515c
...
...
@@ -3,6 +3,6 @@
#include "../rms_norm.h"
DESCRIPTOR
(
cud
a
)
DESCRIPTOR
(
nvidi
a
)
#endif
src/infiniop/ops/rms_norm/operator.cc
View file @
0166515c
...
...
@@ -5,14 +5,14 @@
#ifdef ENABLE_CPU_API
#include "cpu/rms_norm_cpu.h"
#endif
#ifdef
ENABLE_CUDA
_API
#include "
cud
a/rms_norm_
cud
a.cuh"
#if
def
ined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR
_API
)
#include "
nvidi
a/rms_norm_
nvidi
a.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rms_norm_aclnn.h"
#endif
#ifdef ENABLE_METAX_API
#include "m
aca
/rms_norm_m
aca
.cuh"
#include "m
etax
/rms_norm_m
etax
.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "musa/rms_norm_musa.cuh"
...
...
@@ -37,17 +37,20 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
y_desc, \
x_desc, \
w_desc, \
epsilon)
;
epsilon)
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
)
CREATE
(
INFINI_DEVICE_CPU
,
cpu
)
;
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -55,13 +58,13 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
}
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
;
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
m
aca
)
CREATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
musa
)
CREATE
(
INFINI_DEVICE_MOORE
,
musa
)
;
#endif
}
...
...
@@ -75,17 +78,20 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_
CUDA
_API
GET
(
INFINI_DEVICE_
NVIDIA
,
cud
a
)
#ifdef ENABLE_
ILUVATAR
_API
GET
(
INFINI_DEVICE_
ILUVATAR
,
nvidi
a
)
;
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -93,13 +99,13 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
}
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
;
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
m
aca
)
GET
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
musa
)
GET
(
INFINI_DEVICE_MOORE
,
musa
)
;
#endif
}
...
...
@@ -114,17 +120,20 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, y, x, w, stream)
;
workspace, workspace_size, y, x, w, stream)
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
)
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
)
;
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -132,13 +141,13 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
}
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
)
;
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
m
aca
)
CALCULATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE
(
INFINI_DEVICE_MOORE
,
musa
)
CALCULATE
(
INFINI_DEVICE_MOORE
,
musa
)
;
#endif
}
...
...
@@ -152,17 +161,20 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
;
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
)
DESTROY
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_
CUDA
_API
DESTROY
(
INFINI_DEVICE_
NVIDIA
,
cud
a
)
#ifdef ENABLE_
ILUVATAR
_API
DESTROY
(
INFINI_DEVICE_
ILUVATAR
,
nvidi
a
)
;
#endif
#ifdef ENABLE_KUNLUN_API
DESTROY
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
DESTROY
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
;
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -170,13 +182,13 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
}
#endif
#ifdef ENABLE_ASCEND_API
DESTROY
(
INFINI_DEVICE_ASCEND
,
ascend
)
DESTROY
(
INFINI_DEVICE_ASCEND
,
ascend
)
;
#endif
#ifdef ENABLE_METAX_API
DESTROY
(
INFINI_DEVICE_METAX
,
m
aca
)
DESTROY
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_MOORE_API
DESTROY
(
INFINI_DEVICE_MOORE
,
musa
)
DESTROY
(
INFINI_DEVICE_MOORE
,
musa
)
;
#endif
}
...
...
src/infiniop/ops/rope/cuda/
rope_cuda_
kernel.cuh
→
src/infiniop/ops/rope/cuda/kernel.cuh
View file @
0166515c
#ifndef __INFINIOP_ROPE_CUDA_KERNEL_CUH__
#define __INFINIOP_ROPE_CUDA_KERNEL_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
template
<
typename
Tdata
,
typename
Tindex
,
typename
Tangle
>
INFINIOP_CUDA_KERNEL
ropeThreadPerItem
(
__device__
void
ropeThreadPerItem
Block
(
Tdata
*
y_
,
const
Tdata
*
x_
,
const
Tindex
*
__restrict__
pos_ids
,
...
...
@@ -30,9 +28,9 @@ INFINIOP_CUDA_KERNEL ropeThreadPerItem(
Tangle
y0
=
x
.
x
*
cos__
-
x
.
y
*
sin__
,
y1
=
x
.
x
*
sin__
+
x
.
y
*
cos__
;
y
=
half2
(
y0
,
y1
);
}
else
if
constexpr
(
std
::
is_same
<
Tdata
,
__nv
_bfloat16
>::
value
)
{
auto
&
y
=
reinterpret_cast
<
__nv
_bfloat162
&>
(
y_
[
y_offset
+
2
*
i
]);
auto
&
x
=
reinterpret_cast
<
const
__nv
_bfloat162
&>
(
x_
[
x_offset
+
2
*
i
]);
}
else
if
constexpr
(
std
::
is_same
<
Tdata
,
cuda
_bfloat16
>::
value
)
{
auto
&
y
=
reinterpret_cast
<
cuda
_bfloat162
&>
(
y_
[
y_offset
+
2
*
i
]);
auto
&
x
=
reinterpret_cast
<
const
cuda
_bfloat162
&>
(
x_
[
x_offset
+
2
*
i
]);
Tangle
x0
=
__low2bfloat16
(
x
);
Tangle
x1
=
__high2bfloat16
(
x
);
...
...
src/infiniop/ops/rope/maca/rope_maca.h
deleted
100644 → 0
View file @
f0300ff3
#ifndef __INFINIOP_ROPE_MACA_H__
#define __INFINIOP_ROPE_MACA_H__
#include "../rope.h"
DESCRIPTOR
(
maca
)
#endif // __INFINIOP_ROPE_MACA_H__
src/infiniop/ops/rope/maca/rope_maca_kernel.h
deleted
100644 → 0
View file @
f0300ff3
#ifndef __INFINIOP_ROPE_MACA_KERNEL_H__
#define __INFINIOP_ROPE_MACA_KERNEL_H__
#include "../../../devices/maca/maca_kernel_common.h"
template
<
typename
Tdata
,
typename
Tindex
,
typename
Tangle
>
INFINIOP_MACA_KERNEL
ropeThreadPerItem
(
Tdata
*
y_
,
const
Tdata
*
x_
,
const
Tindex
*
__restrict__
pos_ids
,
const
Tangle
*
__restrict__
sin_table
,
const
Tangle
*
__restrict__
cos_table
,
size_t
table_dim
,
ptrdiff_t
y_stride_seqlen
,
ptrdiff_t
y_stride_nhead
,
ptrdiff_t
x_stride_seqlen
,
ptrdiff_t
x_stride_nhead
)
{
auto
y_offset
=
blockIdx
.
x
*
y_stride_seqlen
+
blockIdx
.
y
*
y_stride_nhead
;
auto
x_offset
=
blockIdx
.
x
*
x_stride_seqlen
+
blockIdx
.
y
*
x_stride_nhead
;
size_t
pos_id
=
size_t
(
pos_ids
[
blockIdx
.
x
]);
auto
table_offset
=
pos_id
*
table_dim
;
for
(
size_t
i
=
threadIdx
.
x
;
i
<
table_dim
;
i
+=
blockDim
.
x
)
{
Tangle
sin__
=
sin_table
[
table_offset
+
i
],
cos__
=
cos_table
[
table_offset
+
i
];
if
constexpr
(
std
::
is_same
<
Tdata
,
half
>::
value
)
{
auto
&
y
=
reinterpret_cast
<
half2
&>
(
y_
[
y_offset
+
2
*
i
]);
auto
&
x
=
reinterpret_cast
<
const
half2
&>
(
x_
[
x_offset
+
2
*
i
]);
Tangle
y0
=
x
.
x
*
cos__
-
x
.
y
*
sin__
,
y1
=
x
.
x
*
sin__
+
x
.
y
*
cos__
;
y
=
half2
(
y0
,
y1
);
}
else
{
Tangle
x0
=
x_
[
x_offset
+
2
*
i
],
x1
=
x_
[
x_offset
+
2
*
i
+
1
];
y_
[
y_offset
+
2
*
i
]
=
Tdata
(
x0
*
cos__
-
x1
*
sin__
);
y_
[
y_offset
+
2
*
i
+
1
]
=
Tdata
(
x0
*
sin__
+
x1
*
cos__
);
}
}
}
#endif
src/infiniop/ops/rope/metax/rope_metax.h
0 → 100644
View file @
0166515c
#ifndef __INFINIOP_ROPE_METAX_H__
#define __INFINIOP_ROPE_METAX_H__
#include "../rope.h"
DESCRIPTOR
(
metax
)
#endif // __INFINIOP_ROPE_METAX_H__
src/infiniop/ops/rope/m
aca
/rope_m
aca
.maca
→
src/infiniop/ops/rope/m
etax
/rope_m
etax
.maca
View file @
0166515c
#include "../../../devices/maca/common_maca.h"
#include "rope_maca.h"
#include "rope_maca_kernel.h"
#include "../../../devices/metax/metax_common.h"
#include "rope_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
template <typename Tdata, typename Tindex, typename Tangle>
INFINIOP_METAX_KERNEL ropeThreadPerItemKernel(
Tdata *y_,
const Tdata *x_,
const Tindex *__restrict__ pos_ids,
const Tangle *__restrict__ sin_table,
const Tangle *__restrict__ cos_table,
size_t table_dim,
ptrdiff_t y_stride_seqlen,
ptrdiff_t y_stride_nhead,
ptrdiff_t x_stride_seqlen,
ptrdiff_t x_stride_nhead) {
ropeThreadPerItemBlock(
y_, x_, pos_ids,
sin_table, cos_table,
table_dim,
y_stride_seqlen, y_stride_nhead,
x_stride_seqlen, x_stride_nhead);
}
namespace op::rope::m
aca
{
namespace op::rope::m
etax
{
struct Descriptor::Opaque {
std::shared_ptr<device::m
aca
::Handle::Internal> internal;
std::shared_ptr<device::m
etax
::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
...
...
@@ -21,7 +44,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t sin_desc,
infiniopTensorDescriptor_t cos_desc) {
auto handle = reinterpret_cast<device::m
aca
::Handle *>(handle_);
auto handle = reinterpret_cast<device::m
etax
::Handle *>(handle_);
auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
CHECK_RESULT(info);
...
...
@@ -30,7 +53,7 @@ infiniStatus_t Descriptor::create(
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{reinterpret_cast<device::m
aca
::Handle *>(handle)->internal()},
new Opaque{reinterpret_cast<device::m
etax
::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
...
...
@@ -50,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
dimy = uint32_t(info.nhead);
int nthreads = std::max(int(info.table_dim), block_size);
ropeThreadPerItem<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
ropeThreadPerItem
Kernel
<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
y, x, pos_ids, sin_table, cos_table, info.table_dim,
info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
...
...
@@ -102,6 +125,8 @@ infiniStatus_t Descriptor::calculate(
switch (_info.data_type) {
case INFINI_DTYPE_F16:
ROPE_TYPE(half);
case INFINI_DTYPE_BF16:
ROPE_TYPE(cuda_bfloat16);
case INFINI_DTYPE_F32:
ROPE_TYPE(float);
case INFINI_DTYPE_F64:
...
...
@@ -116,4 +141,4 @@ infiniStatus_t Descriptor::calculate(
#undef ROPE_TYPE
#undef CALCULATE_ROPE
} // namespace op::rope::m
aca
} // namespace op::rope::m
etax
src/infiniop/ops/rope/
cud
a/rope_
cud
a.cu
→
src/infiniop/ops/rope/
nvidi
a/rope_
nvidi
a.cu
View file @
0166515c
#include "../../../devices/cuda/cuda_common.cuh"
#include "rope_cuda.cuh"
#include "rope_cuda_kernel.cuh"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "rope_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
template
<
typename
Tdata
,
typename
Tindex
,
typename
Tangle
>
INFINIOP_CUDA_KERNEL
ropeThreadPerItemKernel
(
Tdata
*
y_
,
const
Tdata
*
x_
,
const
Tindex
*
__restrict__
pos_ids
,
const
Tangle
*
__restrict__
sin_table
,
const
Tangle
*
__restrict__
cos_table
,
size_t
table_dim
,
ptrdiff_t
y_stride_seqlen
,
ptrdiff_t
y_stride_nhead
,
ptrdiff_t
x_stride_seqlen
,
ptrdiff_t
x_stride_nhead
)
{
ropeThreadPerItemBlock
(
y_
,
x_
,
pos_ids
,
sin_table
,
cos_table
,
table_dim
,
y_stride_seqlen
,
y_stride_nhead
,
x_stride_seqlen
,
x_stride_nhead
);
}
namespace
op
::
rope
::
cud
a
{
namespace
op
::
rope
::
nvidi
a
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cud
a
::
Handle
::
Internal
>
internal
;
std
::
shared_ptr
<
device
::
nvidi
a
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
...
...
@@ -21,7 +44,7 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t
sin_desc
,
infiniopTensorDescriptor_t
cos_desc
)
{
auto
handle
=
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle_
);
auto
handle
=
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle_
);
auto
info
=
RoPEInfo
::
createRoPEInfo
(
y_desc
,
x_desc
,
pos_desc
,
sin_desc
,
cos_desc
);
CHECK_RESULT
(
info
);
...
...
@@ -30,7 +53,7 @@ infiniStatus_t Descriptor::create(
*
desc_ptr
=
new
Descriptor
(
info
.
take
(),
0
,
new
Opaque
{
reinterpret_cast
<
device
::
cud
a
::
Handle
*>
(
handle
)
->
internal
()},
new
Opaque
{
reinterpret_cast
<
device
::
nvidi
a
::
Handle
*>
(
handle
)
->
internal
()},
handle
->
device
,
handle
->
device_id
);
...
...
@@ -50,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
dimy
=
uint32_t
(
info
.
nhead
);
int
nthreads
=
std
::
max
(
int
(
info
.
table_dim
),
block_size
);
ropeThreadPerItem
<<<
dim3
(
dimx
,
dimy
),
nthreads
,
0
,
stream
>>>
(
ropeThreadPerItem
Kernel
<<<
dim3
(
dimx
,
dimy
),
nthreads
,
0
,
stream
>>>
(
y
,
x
,
pos_ids
,
sin_table
,
cos_table
,
info
.
table_dim
,
info
.
y_stride_seqlen
,
info
.
y_stride_nhead
,
info
.
x_stride_seqlen
,
info
.
x_stride_nhead
);
...
...
@@ -103,7 +126,7 @@ infiniStatus_t Descriptor::calculate(
case
INFINI_DTYPE_F16
:
ROPE_TYPE
(
half
);
case
INFINI_DTYPE_BF16
:
ROPE_TYPE
(
__nv
_bfloat16
);
ROPE_TYPE
(
cuda
_bfloat16
);
case
INFINI_DTYPE_F32
:
ROPE_TYPE
(
float
);
case
INFINI_DTYPE_F64
:
...
...
@@ -118,4 +141,4 @@ infiniStatus_t Descriptor::calculate(
#undef ROPE_TYPE
#undef CALCULATE_ROPE
}
// namespace op::rope::
cud
a
}
// namespace op::rope::
nvidi
a
src/infiniop/ops/rope/
cud
a/rope_
cud
a.cuh
→
src/infiniop/ops/rope/
nvidi
a/rope_
nvidi
a.cuh
View file @
0166515c
...
...
@@ -3,6 +3,6 @@
#include "../rope.h"
DESCRIPTOR
(
cud
a
)
DESCRIPTOR
(
nvidi
a
)
#endif // __INFINIOP_ROPE_CUDA_H__
src/infiniop/ops/rope/operator.cc
View file @
0166515c
...
...
@@ -5,14 +5,14 @@
#ifdef ENABLE_CPU_API
#include "cpu/rope_cpu.h"
#endif
#ifdef
ENABLE_CUDA
_API
#include "
cud
a/rope_
cud
a.cuh"
#if
def
ined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR
_API
)
#include "
nvidi
a/rope_
nvidi
a.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rope_ascend.h"
#endif
#ifdef ENABLE_METAX_API
#include "m
aca
/rope_m
aca
.h"
#include "m
etax
/rope_m
etax
.h"
#endif
__C
infiniStatus_t
infiniopCreateRoPEDescriptor
(
...
...
@@ -39,11 +39,17 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
maca
);
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -52,16 +58,6 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
pos_ids
,
sin_table
,
cos_table
);
}
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
return
macaCreateRoPEDescriptor
((
MacaHandle_t
)
handle
,
(
RoPEMacaDescriptor_t
*
)
desc_ptr
,
t
,
pos_ids
,
sin_table
,
cos_table
);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case
DevMthreadsGpu
:
{
return
musaCreateRoPEDescriptor
((
MusaHandle_t
)
handle
,
...
...
@@ -87,11 +83,14 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
m
aca
);
GET
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -138,11 +137,14 @@ __C infiniStatus_t infiniopRoPE(
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
m
aca
);
CALCULATE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
@@ -184,11 +186,14 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
m
aca
);
DELETE
(
INFINI_DEVICE_METAX
,
m
etax
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
...
...
src/infiniop/ops/sub/cpu/sub_cpu.cc
View file @
0166515c
...
...
@@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create(
const
auto
&
a_shape
=
a_desc
->
shape
();
const
auto
&
b_shape
=
b_desc
->
shape
();
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
);
CHECK_DTYPE
(
dtype
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
,
INFINI_DTYPE_BF16
);
CHECK_SAME_SHAPE
(
c_shape
,
a_shape
,
b_shape
);
...
...
@@ -43,6 +43,8 @@ infiniStatus_t Descriptor::calculate(
return
_device_info
->
calculate
<
SubOp
,
float
>
(
_info
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
SubOp
,
double
>
(
_info
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
SubOp
,
bf16_t
>
(
_info
,
output
,
inputs
,
stream
);
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
...
...
src/infiniop/ops/sub/cuda/
sub_cuda_int
ern
a
l.cuh
→
src/infiniop/ops/sub/cuda/
k
ern
e
l.cuh
View file @
0166515c
#ifndef __SUB_CUDA_H__
#define __SUB_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace
op
::
sub
::
cuda
{
typedef
struct
SubOp
{
public:
static
constexpr
size_t
num_inputs
=
2
;
template
<
typename
T
>
__device__
__forceinline__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
)
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
||
std
::
is_same_v
<
T
,
cuda_bfloat162
>
)
{
return
__hsub2
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
||
std
::
is_same_v
<
T
,
cuda_bfloat16
>
)
{
return
__hsub
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
return
__fsub_rd
(
a
,
b
);
...
...
src/infiniop/ops/sub/metax/sub_metax.h
0 → 100644
View file @
0166515c
#ifndef __SUB_METAX_API_H__
#define __SUB_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR
(
sub
,
metax
)
#endif // __SUB_METAX_API_H__
Prev
1
2
3
4
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment