Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
784139b9
Unverified
Commit
784139b9
authored
Feb 13, 2026
by
thatPepe
Committed by
GitHub
Feb 13, 2026
Browse files
Merge pull request #990 from InfiniTensor/demo131
Demo-131 Cuda graph with optimized paged attention
parents
3c8fb3c0
1d6527cb
Changes
582
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
758 additions
and
29 deletions
+758
-29
src/infiniop/ops/scaled_mm/nvidia/int8_gemm_kernel.cuh
src/infiniop/ops/scaled_mm/nvidia/int8_gemm_kernel.cuh
+0
-11
src/infiniop/ops/scaled_mm/nvidia/int8_gemm_nvidia.cu
src/infiniop/ops/scaled_mm/nvidia/int8_gemm_nvidia.cu
+95
-4
src/infiniop/ops/scaled_mm/operator.cc
src/infiniop/ops/scaled_mm/operator.cc
+33
-5
src/infiniop/ops/sigmoid/operator.cc
src/infiniop/ops/sigmoid/operator.cc
+25
-3
src/infiniop/ops/silu/operator.cc
src/infiniop/ops/silu/operator.cc
+14
-1
src/infiniop/ops/silu_and_mul/info.h
src/infiniop/ops/silu_and_mul/info.h
+54
-0
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h
+8
-0
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu
+123
-0
src/infiniop/ops/silu_and_mul/operator.cc
src/infiniop/ops/silu_and_mul/operator.cc
+79
-0
src/infiniop/ops/silu_and_mul/silu_and_mul.h
src/infiniop/ops/silu_and_mul/silu_and_mul.h
+46
-0
src/infiniop/ops/softmax/nvidia/softmax_nvidia.cu
src/infiniop/ops/softmax/nvidia/softmax_nvidia.cu
+3
-0
src/infiniop/ops/softmax/operator.cc
src/infiniop/ops/softmax/operator.cc
+13
-1
src/infiniop/ops/softplus/operator.cc
src/infiniop/ops/softplus/operator.cc
+17
-1
src/infiniop/ops/sub/operator.cc
src/infiniop/ops/sub/operator.cc
+13
-1
src/infiniop/ops/swiglu/ninetoothed/build.py
src/infiniop/ops/swiglu/ninetoothed/build.py
+29
-0
src/infiniop/ops/swiglu/ninetoothed/swiglu.h
src/infiniop/ops/swiglu/ninetoothed/swiglu.h
+82
-0
src/infiniop/ops/swiglu/ninetoothed/swiglu.py
src/infiniop/ops/swiglu/ninetoothed/swiglu.py
+22
-0
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+69
-1
src/infiniop/ops/tanh/operator.cc
src/infiniop/ops/tanh/operator.cc
+17
-1
src/infiniop/ops/topkrouter/cuda/kernel.cuh
src/infiniop/ops/topkrouter/cuda/kernel.cuh
+16
-0
No files found.
src/infiniop/ops/scaled_mm/nvidia/int8_gemm_kernel.cuh
View file @
784139b9
...
...
@@ -140,20 +140,9 @@ void cutlass_int8_scaled_mm(
typename
Gemm
::
Arguments
args
{
{
m
,
n
,
k
},
{
a_ptr
,
lda
},
{
b_ptr
,
ldb
},
{
b_s_ptr
,
0
},
{
a_s_ptr
,
0
},
{
bias_ptr
,
ldc
},
{
o_ptr
,
ldd
},
visitor_args
};
/* 需要先看看是否需要workspace */
// auto workspace = torch::empty(
// gemm_op.get_workspace_size(args), torch::TensorOptions().dtype(torch::kUInt8).device(mat_a.device()));
// auto can_implement = gemm_op.can_implement(args);
check_cutlass_status
(
gemm_op
.
can_implement
(
args
));
// TORCH_CHECK(
// can_implement == cutlass::Status::kSuccess,
// "gemm cannot implement, error: ",
// cutlassGetStatusString(can_implement));
auto
status
=
gemm_op
(
args
,
nullptr
,
(
cudaStream_t
)
stream
);
check_cutlass_status
(
status
);
// TORCH_CHECK(status == cutlass::Status::kSuccess, "gemm executioin failed, error: ", cutlassGetStatusString(status));
}
template
<
typename
ElementOutput
,
typename
ArchTag
,
typename
InstructionShape
>
...
...
src/infiniop/ops/scaled_mm/nvidia/int8_gemm_nvidia.cu
View file @
784139b9
#ifdef ENABLE_CUTLASS_API
#include "../../../devices/nvidia/nvidia_handle.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#ifdef ENABLE_CUTLASS_API
#include "int8_gemm_kernel.cuh"
#endif
#include "../cuda/per_channel_dequant_int8.cuh"
#include "int8_gemm_nvidia.cuh"
template
<
typename
Tdata
>
INFINIOP_CUDA_KERNEL
postSym
(
Tdata
*
y
,
int32_t
*
y_packed
,
const
Tdata
*
bias
,
const
int8_t
*
x_packed
,
const
float
*
x_scale
,
const
int8_t
*
w_packed
,
const
float
*
w_scale
,
int
M
,
int
K
,
int
N
)
{
postSymKernel
<
Tdata
>
(
y
,
y_packed
,
bias
,
x_packed
,
x_scale
,
w_packed
,
w_scale
,
M
,
K
,
N
);
}
template
<
typename
Tdata
>
INFINIOP_CUDA_KERNEL
postSym
(
Tdata
*
y
,
int32_t
*
y_packed
,
const
int8_t
*
x_packed
,
const
float
*
x_scale
,
const
int8_t
*
w_packed
,
const
float
*
w_scale
,
int
M
,
int
K
,
int
N
)
{
postSymKernel
<
Tdata
>
(
y
,
y_packed
,
x_packed
,
x_scale
,
w_packed
,
w_scale
,
M
,
K
,
N
);
}
namespace
op
::
i8gemm
::
nvidia
{
struct
Descriptor
::
Opaque
{
...
...
@@ -14,6 +28,7 @@ Descriptor::~Descriptor() {
delete
_opaque
;
}
#ifdef ENABLE_NVIDIA_API
inline
int
getSMVersion
()
{
int
device
{
-
1
};
CHECK_CUDA
(
cudaGetDevice
(
&
device
));
...
...
@@ -23,6 +38,7 @@ inline int getSMVersion() {
CHECK_CUDA
(
cudaDeviceGetAttribute
(
&
sm_minor
,
cudaDevAttrComputeCapabilityMinor
,
device
));
return
sm_major
*
10
+
sm_minor
;
}
#endif
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle_
,
...
...
@@ -40,14 +56,65 @@ infiniStatus_t Descriptor::create(
auto
result
=
I8GemmInfo
::
create
(
out_desc
,
a_desc
,
b_desc
,
MatrixLayout
::
COL_MAJOR
);
CHECK_RESULT
(
result
);
size_t
workspace_size
=
out_desc
->
dim
(
0
)
*
out_desc
->
dim
(
1
)
*
sizeof
(
int32_t
);
*
desc_ptr
=
new
Descriptor
(
new
Opaque
{
handle
->
internal
()},
result
.
take
(),
0
,
dtype
,
result
.
take
(),
workspace_size
,
dtype
,
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
#ifdef ENABLE_QY_API
template
<
unsigned
int
BLOCK_SIZE
,
typename
Tdata
>
infiniStatus_t
Descriptor
::
launchKernel
(
const
I8GemmInfo
&
info
,
Tdata
*
y
,
const
Tdata
*
bias
,
const
int8_t
*
x_packed
,
const
float
*
x_scale
,
const
int8_t
*
w_packed
,
const
float
*
w_scale
,
void
*
stream_
,
void
*
workspace
)
const
{
cudaStream_t
stream
=
(
cudaStream_t
)
stream_
;
int
M
=
(
int
)
info
.
m
;
int
K
=
(
int
)
info
.
k
;
int
N
=
(
int
)
info
.
n
;
char
*
workspace_ptr
=
reinterpret_cast
<
char
*>
(
workspace
);
int32_t
*
y_packed
=
reinterpret_cast
<
int32_t
*>
(
workspace_ptr
);
const
int32_t
alpha_I
=
1
;
const
int32_t
beta_I
=
0
;
int
lda
=
K
;
// w_packed is column-major [K, N]
int
ldb
=
K
;
// x_packed is row-major [M, K]
int
ldc
=
N
;
// y_packed is row-major [M, N]
CHECK_STATUS
(
this
->
_opaque
->
internal
->
useCublas
(
stream
,
[
&
](
cublasHandle_t
handle
)
{
CHECK_CUBLAS
(
cublasGemmEx
(
handle
,
CUBLAS_OP_T
,
// A = w_packed^T : [N, K]
CUBLAS_OP_N
,
// B = x_packed^T viewed column-major : [K, M]
N
,
// m
M
,
// n
K
,
// k
&
alpha_I
,
w_packed
,
CUDA_R_8I
,
lda
,
x_packed
,
CUDA_R_8I
,
ldb
,
&
beta_I
,
y_packed
,
CUDA_R_32I
,
ldc
,
CUBLAS_COMPUTE_32I
,
CUBLAS_GEMM_DEFAULT
));
return
INFINI_STATUS_SUCCESS
;
}));
constexpr
unsigned
int
BLOCK_SIZE_x
=
32
;
constexpr
unsigned
int
BLOCK_SIZE_y
=
32
;
int
num_block_x
=
(
N
+
BLOCK_SIZE_x
-
1
)
/
BLOCK_SIZE_x
;
int
num_block_y
=
(
M
+
BLOCK_SIZE_y
-
1
)
/
BLOCK_SIZE_y
;
dim3
block_dim
(
BLOCK_SIZE_x
,
BLOCK_SIZE_y
,
1
);
dim3
grid_dim
(
num_block_x
,
num_block_y
,
1
);
if
(
bias
==
nullptr
)
{
postSym
<
Tdata
><<<
grid_dim
,
block_dim
,
0
,
stream
>>>
(
y
,
y_packed
,
x_packed
,
x_scale
,
w_packed
,
w_scale
,
M
,
K
,
N
);
}
else
{
postSym
<
Tdata
><<<
grid_dim
,
block_dim
,
0
,
stream
>>>
(
y
,
y_packed
,
bias
,
x_packed
,
x_scale
,
w_packed
,
w_scale
,
M
,
K
,
N
);
}
return
INFINI_STATUS_SUCCESS
;
}
#endif
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
...
...
@@ -58,6 +125,7 @@ infiniStatus_t Descriptor::calculate(
const
void
*
b
,
const
void
*
b_scale
,
void
*
stream
)
const
{
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
auto
sm_version
=
getSMVersion
();
if
(
sm_version
>=
75
&&
sm_version
<
80
)
{
CHECK_DTYPE
(
this
->
_out_dtype
,
INFINI_DTYPE_F16
);
...
...
@@ -111,7 +179,30 @@ infiniStatus_t Descriptor::calculate(
}
else
{
return
INFINI_STATUS_NOT_IMPLEMENTED
;
}
#elif defined ENABLE_QY_API
#define CALCULATE_LINEAR(BLOCK_SIZE, TDATA) \
launchKernel<BLOCK_SIZE, TDATA>(_info, (TDATA *)out, (const TDATA *)bias, (const int8_t *)a, (const float *)a_scale, (const int8_t *)b, (const float *)b_scale, stream, workspace)
#define CALCULATE_LINEAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (this->_out_dtype == INFINI_DTYPE_F16) \
return CALCULATE_LINEAR(BLOCK_SIZE, half); \
else if (this->_out_dtype == INFINI_DTYPE_F32) \
return CALCULATE_LINEAR(BLOCK_SIZE, float); \
else if (this->_out_dtype == INFINI_DTYPE_BF16) \
return CALCULATE_LINEAR(BLOCK_SIZE, __nv_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_1024
)
{
CALCULATE_LINEAR_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_1024
)
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_512
)
{
CALCULATE_LINEAR_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_512
)
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
CALCULATE_LINEAR_WITH_BLOCK_SIZE
(
CUDA_BLOCK_SIZE_4096
)
}
else
{
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
}
#endif
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::i8gemm::nvidia
#endif
\ No newline at end of file
src/infiniop/ops/scaled_mm/operator.cc
View file @
784139b9
...
...
@@ -2,10 +2,14 @@
#include "../../handle.h"
#include "infiniop/ops/int8_gemm.h"
#if defined(ENABLE_NVIDIA_API)
&&
defined(ENABLE_
CUTLASS
_API)
#if defined(ENABLE_NVIDIA_API)
||
defined(ENABLE_
QY
_API)
#include "nvidia/int8_gemm_nvidia.cuh"
#endif
#if defined(ENABLE_MOORE_API)
#include "moore/int8_gemm_moore.h"
#endif
__C
infiniStatus_t
infiniopCreateI8GemmDescriptor
(
infiniopHandle_t
handle
,
infiniopI8GemmDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
out_desc
,
...
...
@@ -26,8 +30,14 @@ __C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
b_desc, \
b_scale_desc);
switch
(
handle
->
device
)
{
#if defined(ENABLE_NVIDIA_API)
&& defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#if defined(ENABLE_QY_API)
CREATE
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
CREATE
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -41,8 +51,14 @@ __C infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t des
case CASE: \
*size = reinterpret_cast<op::i8gemm::NAMESPACE::Descriptor *>(desc)->minWorkspaceSize(); \
return INFINI_STATUS_SUCCESS;
#if defined(ENABLE_NVIDIA_API)
&& defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#if defined(ENABLE_QY_API)
GET
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
GET
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -65,8 +81,14 @@ __C infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
return reinterpret_cast<op::i8gemm::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, out, bias, a, a_scale, b, b_scale, stream);
switch
(
desc
->
device_type
)
{
#if defined(ENABLE_NVIDIA_API)
&& defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
CACULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#if defined(ENABLE_QY_API)
CACULATE
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
CACULATE
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -80,8 +102,14 @@ __C infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t de
delete reinterpret_cast<op::i8gemm::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch
(
desc
->
device_type
)
{
#if defined(ENABLE_NVIDIA_API)
&& defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
DESTROY
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#if defined(ENABLE_QY_API)
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
DESTROY
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/sigmoid/operator.cc
View file @
784139b9
...
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/sigmoid_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
|| defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/sigmoid_nvidia.cuh"
#endif
...
...
@@ -34,6 +34,12 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor(
#ifdef ENABLE_QY_API
CREATE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -58,6 +64,12 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d
#endif
#ifdef ENABLE_QY_API
GET
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -91,7 +103,12 @@ __C infiniStatus_t infiniopSigmoid(
#ifdef ENABLE_QY_API
CALCULATE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
@@ -118,7 +135,12 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) {
#ifdef ENABLE_QY_API
DELETE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
DELETE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
src/infiniop/ops/silu/operator.cc
View file @
784139b9
...
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/silu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
|| defined(ENABLE_ALI_API)
#include "nvidia/silu_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
...
...
@@ -46,6 +46,9 @@ __C infiniStatus_t infiniopCreateSiluDescriptor(
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -77,6 +80,10 @@ __C infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, s
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
@@ -115,6 +122,9 @@ __C infiniStatus_t infiniopSilu(
#ifdef ENABLE_MOORE_API
CALCULATE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -148,6 +158,9 @@ infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc) {
#ifdef ENABLE_MOORE_API
DELETE
(
INFINI_DEVICE_MOORE
,
moore
);
#endif
#ifdef ENABLE_ALI_API
DELETE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/silu_and_mul/info.h
0 → 100644
View file @
784139b9
#ifndef __SILU_AND_MUL_INFO_H__
#define __SILU_AND_MUL_INFO_H__

#include "../../../utils.h"
#include "../../tensor.h"

#include <vector>

namespace op::silu_and_mul {

/// Validated shape metadata for the silu_and_mul (SwiGLU-style) op.
/// Input x has logical shape [batch..., 2 * hidden]; output y is
/// [batch..., hidden].
class SiluAndMulInfo {
    SiluAndMulInfo() = default;

public:
    infiniDtype_t dtype;   // element dtype taken from the output descriptor
    size_t batch_size;     // product of all leading (non-last) dimensions
    size_t out_hidden_dim; // size of the output's last dimension

    /// Validates the descriptor pair and derives batch/hidden sizes.
    ///
    /// Returns INFINI_STATUS_BAD_PARAM when the ranks disagree or the rank
    /// is 0 (a rank-0 tensor would make the `ndim - 1` indexing below
    /// underflow), and INFINI_STATUS_BAD_TENSOR_SHAPE when x's last
    /// dimension is not exactly twice y's last dimension or any leading
    /// dimension differs between x and y.
    static utils::Result<SiluAndMulInfo> create(
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc) {
        auto dtype = y_desc->dtype();
        auto x_shape = x_desc->shape();
        auto y_shape = y_desc->shape();
        auto ndim = x_desc->ndim();
        // Guard against rank 0 before any `ndim - 1` indexing.
        if (ndim == 0 || ndim != y_desc->ndim()) {
            return INFINI_STATUS_BAD_PARAM;
        }
        // The last input dimension packs [gate, up], hence the factor 2.
        if (x_shape[ndim - 1] != 2 * y_shape[ndim - 1]) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        size_t batch = 1;
        for (int i = 0; i < (int)ndim - 1; ++i) {
            if (x_shape[i] != y_shape[i]) {
                return INFINI_STATUS_BAD_TENSOR_SHAPE;
            }
            batch *= y_shape[i];
        }
        return utils::Result<SiluAndMulInfo>(
            SiluAndMulInfo{dtype, batch, y_shape[ndim - 1]});
    }

private:
    SiluAndMulInfo(infiniDtype_t dtype, size_t batch, size_t hidden)
        : dtype(dtype), batch_size(batch), out_hidden_dim(hidden) {}
};

} // namespace op::silu_and_mul

#endif // __SILU_AND_MUL_INFO_H__
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h
0 → 100644
View file @
784139b9
// Moore (MUSA) backend declaration for the silu_and_mul operator.
// Fix: include-guard typo "ADN" -> "AND" (guard is file-local, so this is
// safe for all includers).
#ifndef __SILU_AND_MUL_MOORE_API_H__
#define __SILU_AND_MUL_MOORE_API_H__

#include "../silu_and_mul.h"

DESCRIPTOR(moore)

#endif // __SILU_AND_MUL_MOORE_API_H__
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu
0 → 100644
View file @
784139b9
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_handle.h"
#include "silu_and_mul_moore.h"
#include <musa_bf16.h>
#include <memory>
namespace op::silu_and_mul::moore {

// Device-specific state: keeps the MUSA handle internals alive for the
// descriptor's lifetime so muDNN calls can be issued at calculate() time.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

// Validates the descriptors, builds SiluAndMulInfo shape metadata, and
// allocates the Opaque state. The workspace size is fixed to 0 (the muDNN
// SwiGLU path needs no scratch memory).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    if (!desc_ptr) {
        return INFINI_STATUS_BAD_PARAM;
    }
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = y_desc->dtype();
    // Only half, float, and bfloat16 are supported by this backend.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
    // Input and output must share one dtype; mixed precision is rejected.
    if (x_desc->dtype() != dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    auto result = SiluAndMulInfo::create(y_desc, x_desc);
    CHECK_RESULT(result);
    auto info = result.take();
    *desc_ptr = new Descriptor(
        new Opaque{handle->internal()},
        std::move(info),
        0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Runs muDNN's fused SwiGLU on a contiguous [batch, 2*hidden] input,
// producing a contiguous [batch, hidden] output. T selects the muDNN
// tensor element type (half / __mt_bfloat16 / float).
template <typename T>
infiniStatus_t calculate_impl(
    const SiluAndMulInfo &info,
    std::shared_ptr<device::moore::Handle::Internal> &internal,
    void *y,
    const void *x,
    void *stream) {
    return internal->useMudnn(
        (musaStream_t)stream,
        [&](::musa::dnn::Handle &mudnn_handle) -> infiniStatus_t {
            ::musa::dnn::Tensor x_t, y_t;
            if constexpr (std::is_same_v<T, half>) {
                x_t.SetType(::musa::dnn::Tensor::Type::HALF);
                y_t.SetType(::musa::dnn::Tensor::Type::HALF);
            } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
                x_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
                y_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
            } else {
                x_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
                y_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
            }
            // muDNN takes non-const addresses; x is never written.
            x_t.SetAddr(const_cast<void *>(x));
            y_t.SetAddr(y);
            // --- Construct 2D dimension information ---
            // Explicitly distinguish between Batch and Hidden dimensions
            int64_t b = static_cast<int64_t>(info.batch_size);
            int64_t h = static_cast<int64_t>(info.out_hidden_dim);
            // Input x logical shape is [batch, 2 * hidden]
            std::array<int64_t, 2> x_dims = {b, h * 2};
            std::array<int64_t, 2> x_strides = {h * 2, 1};
            // Output y logical shape is [batch, hidden]
            std::array<int64_t, 2> y_dims = {b, h};
            std::array<int64_t, 2> y_strides = {h, 1};
            x_t.SetNdInfo(2, x_dims.data(), x_strides.data());
            y_t.SetNdInfo(2, y_dims.data(), y_strides.data());
            // Invoke muDNN SwiGLU
            // muDNN will split each row (length 2*h) internally,
            // muDNN treats the first h elements of input x as the 'gate'
            // and the following h elements as the 'up' projection.
            ::musa::dnn::SwiGlu swiglu;
            // NOTE(review): Run()'s return value is discarded — confirm
            // whether it can report failures that should be propagated.
            swiglu.Run(mudnn_handle, y_t, x_t);
            return INFINI_STATUS_SUCCESS;
        });
}

// Dispatches to calculate_impl<T> based on the dtype recorded at create().
// workspace/workspace_size are unused (descriptor was created with size 0).
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y, const void *x,
    void *stream) const {
    infiniDtype_t dtype = _info.dtype;
    switch (dtype) {
    case INFINI_DTYPE_F16:
        return calculate_impl<half>(_info, _opaque->internal, y, x, stream);
    case INFINI_DTYPE_F32:
        return calculate_impl<float>(_info, _opaque->internal, y, x, stream);
    case INFINI_DTYPE_BF16:
        return calculate_impl<__mt_bfloat16>(_info, _opaque->internal, y, x, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::silu_and_mul::moore
src/infiniop/ops/silu_and_mul/operator.cc
0 → 100644
View file @
784139b9
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/silu_and_mul.h"
#ifdef ENABLE_MOORE_API
#include "moore/silu_and_mul_moore.h"
#endif
// Creates a silu_and_mul descriptor for the device carried by `handle`.
// Currently only the Moore backend is wired up.
__C infiniStatus_t infiniopCreateSiluAndMulDescriptor(
    infiniopHandle_t handle,
    infiniopSiluAndMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

#define CREATE(CASE, NAMESPACE)                                                     \
    case CASE:                                                                      \
        return op::silu_and_mul::NAMESPACE::Descriptor::create(                     \
            handle,                                                                 \
            reinterpret_cast<op::silu_and_mul::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                                 \
            x_desc);

    switch (handle->device) {
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    }
    // Fall-through for every device without a registered backend.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Reports the workspace size the descriptor's backend requires.
__C infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(
    infiniopSiluAndMulDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                          \
    case CASE:                                                                                        \
        *size = reinterpret_cast<const op::silu_and_mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    }
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Executes silu_and_mul: y = silu(gate) * up, where x packs [gate, up]
// along its last dimension. Dispatches on the descriptor's device type.
__C infiniStatus_t infiniopSiluAndMul(
    infiniopSiluAndMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                                             \
    case CASE:                                                                                 \
        return reinterpret_cast<const op::silu_and_mul::NAMESPACE::Descriptor *>(desc)->calculate( \
            workspace, workspace_size, y, x, stream);

    switch (desc->device_type) {
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    }
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Destroys a silu_and_mul descriptor, releasing its backend state.
__C infiniStatus_t infiniopDestroySiluAndMulDescriptor(
    infiniopSiluAndMulDescriptor_t desc) {

#define DESTROY(CASE, NAMESPACE)                                                       \
    case CASE:                                                                         \
        delete reinterpret_cast<const op::silu_and_mul::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
    }
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
src/infiniop/ops/silu_and_mul/silu_and_mul.h
0 → 100644
View file @
784139b9
#ifndef SILU_AND_MUL_H
#define SILU_AND_MUL_H
#include "../../operator.h"
#include "info.h"
/*
 * DESCRIPTOR(NAMESPACE) stamps out the per-backend Descriptor class inside
 * namespace op::silu_and_mul::NAMESPACE. Each backend supplies out-of-line
 * definitions for the destructor, create(), and calculate(), plus its own
 * `struct Opaque` holding device-specific state.
 *
 * Comments live outside the macro on purpose: `//` text on a
 * backslash-continued line would swallow the continuation.
 */
#define DESCRIPTOR(NAMESPACE)                                   \
                                                                \
    namespace op::silu_and_mul::NAMESPACE {                     \
    class Descriptor final : public InfiniopDescriptor {        \
        struct Opaque;                                          \
        Opaque *_opaque;                                        \
        SiluAndMulInfo _info;                                   \
        size_t _workspace_size;                                 \
                                                                \
        Descriptor(                                             \
            Opaque *opaque,                                     \
            SiluAndMulInfo info,                                 \
            size_t workspace_size,                              \
            infiniDevice_t device_type,                         \
            int device_id)                                      \
            : InfiniopDescriptor{device_type, device_id},       \
              _opaque(opaque),                                  \
              _info(info),                                      \
              _workspace_size(workspace_size) {}                \
                                                                \
    public:                                                     \
        ~Descriptor();                                          \
                                                                \
        size_t workspaceSize() const { return _workspace_size; } \
                                                                \
        static infiniStatus_t create(                           \
            infiniopHandle_t handle,                            \
            Descriptor **desc_ptr,                              \
            infiniopTensorDescriptor_t y_desc,                  \
            infiniopTensorDescriptor_t x_desc);                 \
                                                                \
        infiniStatus_t calculate(                               \
            void *workspace, size_t workspace_size,             \
            void *y,                                            \
            const void *x,                                      \
            void *stream) const;                                \
    };                                                          \
    }
#endif // SILU_AND_MUL_H
src/infiniop/ops/softmax/nvidia/softmax_nvidia.cu
View file @
784139b9
...
...
@@ -128,6 +128,9 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_4096
)
{
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_4096
>
(
y
,
x
,
_info
.
dtype
,
_info
.
othersize
,
_info
.
dimsize
,
_info
.
stride
,
stream
));
}
else
if
(
_opaque
->
internal
->
maxThreadsPerBlock
()
==
CUDA_BLOCK_SIZE_2048
)
{
CHECK_STATUS
(
launchKernel
<
CUDA_BLOCK_SIZE_2048
>
(
y
,
x
,
_info
.
dtype
,
_info
.
othersize
,
_info
.
dimsize
,
_info
.
stride
,
stream
));
}
else
{
return
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED
;
}
...
...
src/infiniop/ops/softmax/operator.cc
View file @
784139b9
...
...
@@ -2,7 +2,7 @@
#include "../../handle.h"
#include "infiniop/ops/softmax.h"
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
|| defined(ENABLE_ALI_API)
#include "nvidia/softmax_nvidia.cuh"
#endif
...
...
@@ -33,6 +33,9 @@ __C infiniStatus_t infiniopCreateSoftmaxDescriptor(
#endif
#ifdef ENABLE_HYGON_API
CREATE
(
INFINI_DEVICE_HYGON
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
}
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -57,6 +60,9 @@ __C infiniStatus_t infiniopGetSoftmaxWorkspaceSize(infiniopSoftmaxDescriptor_t d
#endif
#ifdef ENABLE_HYGON_API
GET
(
INFINI_DEVICE_HYGON
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
}
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -86,6 +92,9 @@ __C infiniStatus_t infiniopSoftmax(
#endif
#ifdef ENABLE_HYGON_API
CALCULATE
(
INFINI_DEVICE_HYGON
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
}
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -110,6 +119,9 @@ __C infiniStatus_t infiniopDestroySoftmaxDescriptor(infiniopSoftmaxDescriptor_t
#endif
#ifdef ENABLE_HYGON_API
DESTROY
(
INFINI_DEVICE_HYGON
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
DESTROY
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
}
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/softplus/operator.cc
View file @
784139b9
...
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/softplus_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
|| defined(ENABLE_ALI_API)
#include "nvidia/softplus_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
...
...
@@ -49,6 +49,10 @@ __C infiniStatus_t infiniopCreateSoftplusDescriptor(
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
@@ -82,6 +86,10 @@ __C infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
@@ -123,6 +131,10 @@ __C infiniStatus_t infiniopSoftplus(
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
@@ -158,6 +170,10 @@ infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc) {
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_ALI_API
DELETE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
src/infiniop/ops/sub/operator.cc
View file @
784139b9
...
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/sub_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
|| defined(ENABLE_ALI_API)
#include "nvidia/sub_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
...
...
@@ -51,6 +51,9 @@ __C infiniStatus_t infiniopCreateSubDescriptor(
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -85,6 +88,9 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -128,6 +134,9 @@ __C infiniStatus_t infiniopSub(
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -164,6 +173,9 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) {
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_ALI_API
DELETE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/swiglu/ninetoothed/build.py
0 → 100644
View file @
784139b9
import
ninetoothed
from
.
import
swiglu
import
infiniop.ninetoothed.build
def build():
    """Ahead-of-time build of the ninetoothed SwiGLU kernel variants.

    Instantiates the kernel over every supported rank (1..5) and dtype
    (fp16 / bf16 / fp32) with a fixed block size of 1024, emitting the
    generated sources into the shared ninetoothed build directory.
    """
    max_ndim = 5
    param_grid = {
        "ndim": range(1, max_ndim + 1),
        "dtype": (
            ninetoothed.float16,
            ninetoothed.bfloat16,
            ninetoothed.float32,
        ),
        "block_size": (1024,),
    }
    infiniop.ninetoothed.build.build(
        swiglu.premake,
        param_grid,
        caller="cuda",
        op_name="swiglu",
        output_dir=infiniop.ninetoothed.build.BUILD_DIRECTORY_PATH,
    )
src/infiniop/ops/swiglu/ninetoothed/swiglu.h
0 → 100644
View file @
784139b9
#ifndef SWIGLU_H
#define SWIGLU_H

#include "../../../handle.h"
#include "../../../operator.h"
#include "../../../tensor.h"

#include "../../../../../build/ninetoothed/swiglu.h"
#include "../../../ninetoothed/utils.h"

namespace op::swiglu::ninetoothed {

// SwiGLU descriptor backed by ahead-of-time-built ninetoothed kernels.
// Captures shapes/strides of the output and both inputs at creation time;
// inputs[0] is the 'up' tensor and inputs[1] the 'gate' tensor.
class Descriptor final : public InfiniopDescriptor {
public:
    Descriptor(infiniopHandle_t handle,
               infiniopTensorDescriptor_t out_desc,
               std::vector<infiniopTensorDescriptor_t> input_desc_vec)
        : InfiniopDescriptor{handle->device, handle->device_id},
          out_shape_{out_desc->shape()},
          out_strides_{out_desc->strides()},
          up_shape_{input_desc_vec[0]->shape()},
          up_strides_{input_desc_vec[0]->strides()},
          gate_shape_{input_desc_vec[1]->shape()},
          gate_strides_{input_desc_vec[1]->strides()},
          dtype_{out_desc->dtype()} {}

    ~Descriptor() = default;

    // The prebuilt kernels need no scratch memory.
    size_t workspaceSize() const { return 0; }

    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
        *desc_ptr = new Descriptor(handle, out_desc, input_desc_vec);
        return INFINI_STATUS_SUCCESS;
    }

    // Wraps the raw pointers in ninetoothed tensor views and launches the
    // generated SwiGLU kernel with block size 1024. A truthy return from
    // launch_swiglu is treated as an unsupported configuration.
    infiniStatus_t calculate(
        void *workspace, size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const {
        auto out_nt{::ninetoothed::Tensor(output, out_shape_, out_strides_)};
        auto up_nt{::ninetoothed::Tensor(inputs[0], up_shape_, up_strides_)};
        auto gate_nt{::ninetoothed::Tensor(inputs[1], gate_shape_, gate_strides_)};
        if (launch_swiglu(stream, out_nt, up_nt, gate_nt,
                          out_shape_.size(), dtype_, 1024)) {
            return INFINI_STATUS_NOT_IMPLEMENTED;
        }
        return INFINI_STATUS_SUCCESS;
    }

private:
    using Size = ::ninetoothed::Tensor<>::Size;
    using Stride = ::ninetoothed::Tensor<>::Stride;

    std::vector<Size> out_shape_;
    std::vector<Stride> out_strides_;
    std::vector<Size> up_shape_;
    std::vector<Stride> up_strides_;
    std::vector<Size> gate_shape_;
    std::vector<Stride> gate_strides_;
    infiniDtype_t dtype_;
};

} // namespace op::swiglu::ninetoothed

#endif // SWIGLU_H
src/infiniop/ops/swiglu/ninetoothed/swiglu.py
0 → 100644
View file @
784139b9
import
functools
import
ninetoothed.language
as
ntl
from
ninetoothed
import
Tensor
from
ntops.kernels.element_wise
import
arrangement
def application(output, up, gate):
    # SwiGLU: output = silu(gate) * up, with silu(x) = x * sigmoid(x).
    # The gate is cast to float32 before the sigmoid to keep the
    # nonlinearity numerically stable for half-precision inputs.
    # In the ninetoothed DSL, assigning to the `output` parameter is how a
    # kernel writes its result, hence the F841 (unused variable) suppression.
    output = ntl.sigmoid(ntl.cast(gate, ntl.float32)) * gate * up  # noqa: F841
def premake(ndim, dtype=None, block_size=None):
    """Assemble the (arrangement, application, tensors) triple for SwiGLU.

    The arrangement is the shared element-wise one from ntops, pre-bound
    to the requested block size; the three tensor placeholders (output,
    up, gate) all share the same rank and dtype.
    """
    bound_arrangement = functools.partial(arrangement, block_size=block_size)
    tensors = tuple(Tensor(ndim, dtype=dtype) for _ in range(3))
    return bound_arrangement, application, tensors
src/infiniop/ops/swiglu/operator.cc
View file @
784139b9
...
...
@@ -5,15 +5,23 @@
#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "nvidia/swiglu_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif
#ifdef ENABLE_METAX_API
#if defined(ENABLE_NINETOOTHED)
#include "ninetoothed/swiglu.h"
#else
#include "metax/swiglu_metax.h"
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/swiglu_bang.h"
#endif
...
...
@@ -46,11 +54,22 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_NVIDIA
,
ninetoothed
);
#else
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_ILUVATAR
,
ninetoothed
);
#else
CREATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_QY_API
CREATE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
...
...
@@ -61,8 +80,12 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CREATE
(
INFINI_DEVICE_METAX
,
ninetoothed
);
#else
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
...
...
@@ -92,11 +115,22 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_NVIDIA
,
ninetoothed
);
#else
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_ILUVATAR
,
ninetoothed
);
#else
GET
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_QY_API
GET
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
...
...
@@ -107,8 +141,12 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
GET
(
INFINI_DEVICE_METAX
,
ninetoothed
);
#else
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
GET
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
...
...
@@ -145,11 +183,22 @@ __C infiniStatus_t infiniopSwiGLU(
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
ninetoothed
);
#else
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
ninetoothed
);
#else
CALCULATE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_QY_API
CALCULATE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
...
...
@@ -160,8 +209,12 @@ __C infiniStatus_t infiniopSwiGLU(
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
CALCULATE
(
INFINI_DEVICE_METAX
,
ninetoothed
);
#else
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
...
...
@@ -193,11 +246,22 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_NVIDIA
,
ninetoothed
);
#else
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ILUVATAR_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_ILUVATAR
,
ninetoothed
);
#else
DELETE
(
INFINI_DEVICE_ILUVATAR
,
nvidia
);
#endif
#endif
#ifdef ENABLE_ALI_API
DELETE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
#ifdef ENABLE_QY_API
DELETE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
...
...
@@ -208,8 +272,12 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
#ifdef ENABLE_NINETOOTHED
DELETE
(
INFINI_DEVICE_METAX
,
ninetoothed
);
#else
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE
(
INFINI_DEVICE_CAMBRICON
,
bang
);
#endif
...
...
src/infiniop/ops/tanh/operator.cc
View file @
784139b9
...
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/tanh_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
#include "nvidia/tanh_nvidia.cuh"
#endif
// #ifdef ENABLE_METAX_API
...
...
@@ -40,6 +40,10 @@ __C infiniStatus_t infiniopCreateTanhDescriptor(
#ifdef ENABLE_QY_API
CREATE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
CREATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
// #ifdef ENABLE_METAX_API
// CREATE(INFINI_DEVICE_METAX, metax);
// #endif
...
...
@@ -71,6 +75,10 @@ __C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, s
#ifdef ENABLE_QY_API
GET
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
GET
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
// #ifdef ENABLE_METAX_API
// GET(INFINI_DEVICE_METAX, metax);
// #endif
...
...
@@ -109,6 +117,10 @@ __C infiniStatus_t infiniopTanh(
#ifdef ENABLE_QY_API
CALCULATE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
CALCULATE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
// #ifdef ENABLE_METAX_API
// CALCULATE(INFINI_DEVICE_METAX, metax);
// #endif
...
...
@@ -142,6 +154,10 @@ infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) {
#ifdef ENABLE_QY_API
DELETE
(
INFINI_DEVICE_QY
,
nvidia
);
#endif
#ifdef ENABLE_ALI_API
DELETE
(
INFINI_DEVICE_ALI
,
nvidia
);
#endif
// #ifdef ENABLE_METAX_API
// DELETE(INFINI_DEVICE_METAX, metax);
// #endif
...
...
src/infiniop/ops/topkrouter/cuda/kernel.cuh
View file @
784139b9
...
...
@@ -27,6 +27,15 @@ struct CustomLess {
}
};
// Warp-level sum reduction for the Hygon platform (used where cub's
// WarpReduce is unavailable). XOR-butterfly pattern: each step combines
// lanes whose ids differ by `mask`, halving the stride until every lane
// holds the sum of `val` across all `warp_threads` lanes.
// NOTE(review): `warp_threads` is assumed to be a power of two, and the
// full 0xffffffff mask assumes all 32 lanes are active -- confirm for
// partial warps / warp_threads < 32.
template <int warp_threads>
__inline__ __device__ float WarpSum(float val) {
    for (int mask = warp_threads / 2; mask > 0; mask /= 2) {
        val += __shfl_xor_sync(0xffffffff, val, mask);
    }
    return val;
}
template
<
typename
T
,
int
BLOCK_THREADS
=
256
>
__global__
void
topkrouter_kernel
(
float
*
values_topk
,
// output values, shape [N, topk]
int
*
indices_topk
,
// output indices, shape [N, topk]
...
...
@@ -137,12 +146,19 @@ __global__ void topkrouter_kernel(float *values_topk, // 输出数
value
=
sigmoid_func
(
data_input
[
index
]);
}
{
#ifdef ENABLE_HYGON_API
float
warp_sum
=
WarpSum
<
warp_threads
>
(
value
);
if
(
0
==
tid
)
{
share_sum
=
warp_sum
+
1e-9
f
;
}
#else
typedef
cub
::
WarpReduce
<
float
,
warp_threads
>
WarpReduce
;
__shared__
typename
WarpReduce
::
TempStorage
temp_storage
;
float
warp_sum
=
WarpReduce
(
temp_storage
).
Sum
(
value
);
if
(
0
==
tid
)
{
share_sum
=
warp_sum
+
1e-9
f
;
}
#endif
}
__syncwarp
();
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
…
30
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment