Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
f8e9d924
Unverified
Commit
f8e9d924
authored
Aug 11, 2025
by
PanZezhong1725
Committed by
GitHub
Aug 11, 2025
Browse files
Merge pull request #352 from InfiniTensor/p800-sub
issue/349 P800 elementwise sub mul clip
parents
60ca4508
94e24c8d
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
353 additions
and
3 deletions
+353
-3
src/infiniop/elementwise/kunlun/elementwise_kunlun.h
src/infiniop/elementwise/kunlun/elementwise_kunlun.h
+0
-3
src/infiniop/ops/clip/kunlun/clip_kunlun.h
src/infiniop/ops/clip/kunlun/clip_kunlun.h
+8
-0
src/infiniop/ops/clip/kunlun/clip_kunlun.xpu
src/infiniop/ops/clip/kunlun/clip_kunlun.xpu
+69
-0
src/infiniop/ops/clip/kunlun/kernel.h
src/infiniop/ops/clip/kunlun/kernel.h
+30
-0
src/infiniop/ops/clip/operator.cc
src/infiniop/ops/clip/operator.cc
+15
-0
src/infiniop/ops/mul/kunlun/kernel.h
src/infiniop/ops/mul/kunlun/kernel.h
+25
-0
src/infiniop/ops/mul/kunlun/mul_kunlun.h
src/infiniop/ops/mul/kunlun/mul_kunlun.h
+8
-0
src/infiniop/ops/mul/kunlun/mul_kunlun.xpu
src/infiniop/ops/mul/kunlun/mul_kunlun.xpu
+67
-0
src/infiniop/ops/mul/operator.cc
src/infiniop/ops/mul/operator.cc
+15
-0
src/infiniop/ops/sub/kunlun/kernel.h
src/infiniop/ops/sub/kunlun/kernel.h
+25
-0
src/infiniop/ops/sub/kunlun/sub_kunlun.h
src/infiniop/ops/sub/kunlun/sub_kunlun.h
+8
-0
src/infiniop/ops/sub/kunlun/sub_kunlun.xpu
src/infiniop/ops/sub/kunlun/sub_kunlun.xpu
+67
-0
src/infiniop/ops/sub/operator.cc
src/infiniop/ops/sub/operator.cc
+16
-0
No files found.
src/infiniop/elementwise/kunlun/elementwise_kunlun.h
View file @
f8e9d924
...
...
@@ -227,9 +227,6 @@ private:
CHECK_KUNLUN
(
xpu_memcpy_async
(
workspace
,
h_inputs_arr
,
input_arr_size
,
XPU_HOST_TO_DEVICE
,
stream
));
CHECK_KUNLUN
(
xpu_memcpy_async
((
void
*
)
d_meta_start
,
info_meta_start
,
info
.
getMetaMemSize
(),
XPU_HOST_TO_DEVICE
,
stream
));
xpu_wait
(
stream
);
// xpu_wait(stream);
// offset/assign the pointers
d_inputs_arr
=
reinterpret_cast
<
__global_ptr__
const
void
**>
(
workspace
);
d_output_shape
=
reinterpret_cast
<
__global_ptr__
const
size_t
*>
(
d_meta_start
);
...
...
src/infiniop/ops/clip/kunlun/clip_kunlun.h
0 → 100644
View file @
f8e9d924
#ifndef __CLIP_KUNLUN_API_H__
#define __CLIP_KUNLUN_API_H__
#include "../../../elementwise/kunlun/elementwise_kunlun_api.h"
// Invokes ELEMENTWISE_DESCRIPTOR for the (clip, kunlun) pair.
// NOTE(review): the macro is not defined in this header; presumably it comes from
// elementwise_kunlun_api.h and declares op::clip::kunlun::Descriptor, whose
// destructor, create() and calculate() are defined in clip_kunlun.xpu - confirm.
ELEMENTWISE_DESCRIPTOR
(
clip
,
kunlun
)
#endif // __CLIP_KUNLUN_API_H__
src/infiniop/ops/clip/kunlun/clip_kunlun.xpu
0 → 100644
View file @
f8e9d924
#include "../../../elementwise/kunlun/elementwise_kunlun.h"
#include "clip_kunlun.h"
#include "kernel.h"
// Kernel instantiations for the clip functor, placed in op::elementwise::kunlun.
namespace op::elementwise::kunlun {
// Alias so the functor can be named without qualification in the macro calls below.
using ClipOp = op::clip::kunlun::ClipOp;
// One invocation per element type that Descriptor::calculate dispatches on
// (float, half, bfloat16_t).
// NOTE(review): INSTANTIATE_ELEMENTWISE_KERNEL is not defined in this file;
// presumably it comes from elementwise_kunlun.h and emits the device kernel for
// the given <num_inputs, Op, T> triple - confirm against the macro definition.
INSTANTIATE_ELEMENTWISE_KERNEL(ClipOp::num_inputs, ClipOp, float);
INSTANTIATE_ELEMENTWISE_KERNEL(ClipOp::num_inputs, ClipOp, half);
INSTANTIATE_ELEMENTWISE_KERNEL(ClipOp::num_inputs, ClipOp, bfloat16_t);
} // namespace op::elementwise::kunlun
namespace op::clip::kunlun {
// Destructor is defaulted out of line, so its definition lives in this translation unit.
Descriptor::~Descriptor() = default;
/**
 * Validates the tensor descriptors of a clip operation and builds its Kunlun descriptor.
 *
 * @param handle_        Opaque handle, reinterpreted as device::kunlun::Handle.
 * @param desc_ptr       Out-parameter for the new descriptor. It is not touched
 *                       directly in this body; presumably it is assigned inside
 *                       CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR - confirm.
 * @param out_desc       Output tensor descriptor; provides the dtype and the reference shape.
 * @param input_desc_vec Input descriptors in the order {input, min, max}. at() throws
 *                       std::out_of_range if fewer than three entries are supplied.
 * @return INFINI_STATUS_SUCCESS when control reaches the end of the function.
 *         NOTE(review): the CHECK_* / CREATE_* macros presumably return an error
 *         status early on failure - confirm against their definitions.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto *handle = reinterpret_cast<device::kunlun::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    // Fetch all three operand descriptors up front (bounds-checked access).
    const auto &x_desc = input_desc_vec.at(0);
    const auto &lo_desc = input_desc_vec.at(1);
    const auto &hi_desc = input_desc_vec.at(2);

    const auto &out_shape = out_desc->shape();
    const auto &in_shape = x_desc->shape();
    const auto &min_shape = lo_desc->shape();
    const auto &max_shape = hi_desc->shape();

    // Only the half, float and bfloat16 element types are accepted.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);

    // Every operand is compared against the output shape.
    CHECK_SAME_SHAPE(out_shape, in_shape);
    CHECK_SAME_SHAPE(out_shape, min_shape);
    CHECK_SAME_SHAPE(out_shape, max_shape);

    CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}
/**
 * Runs the clip kernel for the element type stored in this descriptor.
 *
 * @param workspace      Scratch buffer forwarded to the elementwise implementation.
 * @param workspace_size Size of `workspace`; must be at least `_workspace_size`.
 * @param output         Destination buffer, forwarded unchanged.
 * @param inputs         Input buffers, forwarded unchanged.
 * @param stream         Stream handle, forwarded unchanged.
 * @return INFINI_STATUS_INSUFFICIENT_WORKSPACE if the workspace is too small,
 *         INFINI_STATUS_BAD_TENSOR_DTYPE for an unsupported `_dtype`, otherwise
 *         whatever `_device_info->calculate` returns.
 *
 * NOTE(review): the leading template argument 8 is passed through as-is; its
 * meaning is defined by `_device_info`'s calculate template - confirm.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    // Dispatch on the element type; each supported type returns immediately.
    if (_dtype == INFINI_DTYPE_F16) {
        return _device_info->calculate<8, ClipOp, half>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_BF16) {
        return _device_info->calculate<8, ClipOp, bfloat16_t>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_F32) {
        return _device_info->calculate<8, ClipOp, float>(_info, workspace, output, inputs, stream);
    }

    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::clip::kunlun
src/infiniop/ops/clip/kunlun/kernel.h
0 → 100644
View file @
f8e9d924
#ifndef __CLIP_KUNLUN_KERNEL_H__
#define __CLIP_KUNLUN_KERNEL_H__
#include <xpu/kernel/xtdk_io.h>
namespace op::clip::kunlun {

/**
 * Element-wise clip functor.
 *
 * Reads three operands per element - inputs[0] is the value, inputs[1] the lower
 * bound and inputs[2] the upper bound - and returns fmax(fmin(value, upper), lower).
 * Because fmax is applied last, the lower bound wins when lower > upper
 * (with standard fmin/fmax semantics).
 */
struct ClipOp {
    // Number of operands consumed per output element.
    static constexpr int num_inputs = 3;

    // Generic version: the computation is expressed directly on values of type T.
    template <typename T>
    inline __device__ T operator()(const T *inputs) const {
        T value = inputs[0];
        T lower = inputs[1];
        T upper = inputs[2];
        return fmax(fmin(value, upper), lower);
    }

    // bfloat16 version: operands are widened to float, clipped in float
    // precision, and the result is converted back to bfloat16.
    inline __device__ bfloat16_t operator()(const bfloat16_t *inputs) const {
        float value = __bfloat162float(inputs[0]);
        float lower = __bfloat162float(inputs[1]);
        float upper = __bfloat162float(inputs[2]);
        float clipped = fmax(fmin(value, upper), lower);
        return __float2bfloat16(clipped);
    }
};

} // namespace op::clip::kunlun
#endif // __CLIP_KUNLUN_KERNEL_H__
src/infiniop/ops/clip/operator.cc
View file @
f8e9d924
...
...
@@ -11,6 +11,9 @@
#ifdef ENABLE_METAX_API
#include "metax/clip_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/clip_kunlun.h"
#endif
__C
infiniStatus_t
infiniopCreateClipDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -42,6 +45,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -69,6 +75,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
#endif
}
...
...
@@ -106,6 +115,9 @@ __C infiniStatus_t infiniopClip(
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -136,6 +148,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/mul/kunlun/kernel.h
0 → 100644
View file @
f8e9d924
#ifndef __MUL_KUNLUN_KERNEL_H__
#define __MUL_KUNLUN_KERNEL_H__
namespace op::mul::kunlun {

/**
 * Element-wise multiplication functor: returns inputs[0] * inputs[1].
 */
struct MulOp {
    // Number of operands consumed per output element.
    static constexpr int num_inputs = 2;

    // Generic version: multiplies directly in type T.
    template <typename T>
    inline __device__ T operator()(const T *inputs) const {
        T lhs = inputs[0];
        T rhs = inputs[1];
        return lhs * rhs;
    }

    // bfloat16 version: operands are widened to float, multiplied in float
    // precision, and the product is converted back to bfloat16.
    inline __device__ bfloat16_t operator()(const bfloat16_t *inputs) const {
        float lhs = __bfloat162float(inputs[0]);
        float rhs = __bfloat162float(inputs[1]);
        return __float2bfloat16(lhs * rhs);
    }
};

} // namespace op::mul::kunlun
#endif // __MUL_KUNLUN_KERNEL_H__
src/infiniop/ops/mul/kunlun/mul_kunlun.h
0 → 100644
View file @
f8e9d924
#ifndef __MUL_KUNLUN_API_H__
#define __MUL_KUNLUN_API_H__
#include "../../../elementwise/kunlun/elementwise_kunlun_api.h"
// Invokes ELEMENTWISE_DESCRIPTOR for the (mul, kunlun) pair.
// NOTE(review): the macro is not defined in this header; presumably it comes from
// elementwise_kunlun_api.h and declares op::mul::kunlun::Descriptor, whose
// destructor, create() and calculate() are defined in mul_kunlun.xpu - confirm.
ELEMENTWISE_DESCRIPTOR
(
mul
,
kunlun
)
#endif // __MUL_KUNLUN_API_H__
src/infiniop/ops/mul/kunlun/mul_kunlun.xpu
0 → 100644
View file @
f8e9d924
#include "../../../elementwise/kunlun/elementwise_kunlun.h"
#include "kernel.h"
#include "mul_kunlun.h"
// Kernel instantiations for the multiplication functor, placed in op::elementwise::kunlun.
namespace op::elementwise::kunlun {
// Alias so the functor can be named without qualification in the macro calls below.
using MulOp = op::mul::kunlun::MulOp;
// One invocation per element type that Descriptor::calculate dispatches on
// (float, half, bfloat16_t).
// NOTE(review): INSTANTIATE_ELEMENTWISE_KERNEL is not defined in this file;
// presumably it comes from elementwise_kunlun.h and emits the device kernel for
// the given <num_inputs, Op, T> triple - confirm against the macro definition.
INSTANTIATE_ELEMENTWISE_KERNEL(MulOp::num_inputs, MulOp, float);
INSTANTIATE_ELEMENTWISE_KERNEL(MulOp::num_inputs, MulOp, half);
INSTANTIATE_ELEMENTWISE_KERNEL(MulOp::num_inputs, MulOp, bfloat16_t);
} // namespace op::elementwise::kunlun
namespace op::mul::kunlun {
// Destructor is defaulted out of line, so its definition lives in this translation unit.
Descriptor::~Descriptor() = default;
/**
 * Validates the tensor descriptors of a multiplication and builds its Kunlun descriptor.
 *
 * @param handle_        Opaque handle, reinterpreted as device::kunlun::Handle.
 * @param desc_ptr       Out-parameter for the new descriptor. It is not touched
 *                       directly in this body; presumably it is assigned inside
 *                       CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR - confirm.
 * @param out_desc       Output tensor descriptor; provides the dtype and the reference shape.
 * @param input_desc_vec Input descriptors in the order {a, b}. at() throws
 *                       std::out_of_range if fewer than two entries are supplied.
 * @return INFINI_STATUS_SUCCESS when control reaches the end of the function.
 *         NOTE(review): the CHECK_* / CREATE_* macros presumably return an error
 *         status early on failure - confirm against their definitions.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto *handle = reinterpret_cast<device::kunlun::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    // Fetch both operand descriptors up front (bounds-checked access).
    const auto &lhs_desc = input_desc_vec.at(0);
    const auto &rhs_desc = input_desc_vec.at(1);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = lhs_desc->shape();
    const auto &b_shape = rhs_desc->shape();

    // Only the half, float and bfloat16 element types are accepted.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);

    // Both operands are compared against the output shape.
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // Create the KUNLUN elementwise descriptor.
    CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}
/**
 * Runs the multiplication kernel for the element type stored in this descriptor.
 *
 * @param workspace      Scratch buffer forwarded to the elementwise implementation.
 * @param workspace_size Size of `workspace`; must be at least `_workspace_size`.
 * @param output         Destination buffer, forwarded unchanged.
 * @param inputs         Input buffers, forwarded unchanged.
 * @param stream         Stream handle, forwarded unchanged.
 * @return INFINI_STATUS_INSUFFICIENT_WORKSPACE if the workspace is too small,
 *         INFINI_STATUS_BAD_TENSOR_DTYPE for an unsupported `_dtype`, otherwise
 *         whatever `_device_info->calculate` returns.
 *
 * NOTE(review): the leading template argument 8 is passed through as-is; its
 * meaning is defined by `_device_info`'s calculate template - confirm.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    // Dispatch on the element type; each supported type returns immediately.
    if (_dtype == INFINI_DTYPE_F16) {
        return _device_info->calculate<8, MulOp, half>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_BF16) {
        return _device_info->calculate<8, MulOp, bfloat16_t>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_F32) {
        return _device_info->calculate<8, MulOp, float>(_info, workspace, output, inputs, stream);
    }

    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::mul::kunlun
src/infiniop/ops/mul/operator.cc
View file @
f8e9d924
...
...
@@ -11,6 +11,9 @@
#ifdef ENABLE_METAX_API
#include "metax/mul_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/mul_kunlun.h"
#endif
__C
infiniStatus_t
infiniopCreateMulDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -42,6 +45,9 @@ __C infiniStatus_t infiniopCreateMulDescriptor(
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -70,6 +76,9 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -107,6 +116,9 @@ __C infiniStatus_t infiniopMul(
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -137,6 +149,9 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/sub/kunlun/kernel.h
0 → 100644
View file @
f8e9d924
#ifndef __SUB_KUNLUN_KERNEL_H__
#define __SUB_KUNLUN_KERNEL_H__
namespace op::sub::kunlun {

/**
 * Element-wise subtraction functor: returns inputs[0] - inputs[1].
 */
struct SubOp {
    // Number of operands consumed per output element.
    static constexpr int num_inputs = 2;

    // Generic version: subtracts directly in type T.
    template <typename T>
    inline __device__ T operator()(const T *inputs) const {
        T minuend = inputs[0];
        T subtrahend = inputs[1];
        return minuend - subtrahend;
    }

    // bfloat16 version: operands are widened to float, subtracted in float
    // precision, and the difference is converted back to bfloat16.
    inline __device__ bfloat16_t operator()(const bfloat16_t *inputs) const {
        float minuend = __bfloat162float(inputs[0]);
        float subtrahend = __bfloat162float(inputs[1]);
        return __float2bfloat16(minuend - subtrahend);
    }
};

} // namespace op::sub::kunlun
#endif // __SUB_KUNLUN_KERNEL_H__
src/infiniop/ops/sub/kunlun/sub_kunlun.h
0 → 100644
View file @
f8e9d924
#ifndef __SUB_KUNLUN_API_H__
#define __SUB_KUNLUN_API_H__
#include "../../../elementwise/kunlun/elementwise_kunlun_api.h"
// Invokes ELEMENTWISE_DESCRIPTOR for the (sub, kunlun) pair.
// NOTE(review): the macro is not defined in this header; presumably it comes from
// elementwise_kunlun_api.h and declares op::sub::kunlun::Descriptor, whose
// destructor, create() and calculate() are defined in sub_kunlun.xpu - confirm.
ELEMENTWISE_DESCRIPTOR
(
sub
,
kunlun
)
#endif // __SUB_KUNLUN_API_H__
src/infiniop/ops/sub/kunlun/sub_kunlun.xpu
0 → 100644
View file @
f8e9d924
#include "../../../elementwise/kunlun/elementwise_kunlun.h"
#include "kernel.h"
#include "sub_kunlun.h"
// Kernel instantiations for the subtraction functor, placed in op::elementwise::kunlun.
namespace op::elementwise::kunlun {
// Alias so the functor can be named without qualification in the macro calls below.
using SubOp = op::sub::kunlun::SubOp;
// One invocation per element type that Descriptor::calculate dispatches on
// (float, half, bfloat16_t).
// NOTE(review): INSTANTIATE_ELEMENTWISE_KERNEL is not defined in this file;
// presumably it comes from elementwise_kunlun.h and emits the device kernel for
// the given <num_inputs, Op, T> triple - confirm against the macro definition.
INSTANTIATE_ELEMENTWISE_KERNEL(SubOp::num_inputs, SubOp, float);
INSTANTIATE_ELEMENTWISE_KERNEL(SubOp::num_inputs, SubOp, half);
INSTANTIATE_ELEMENTWISE_KERNEL(SubOp::num_inputs, SubOp, bfloat16_t);
} // namespace op::elementwise::kunlun
namespace op::sub::kunlun {
// Destructor is defaulted out of line, so its definition lives in this translation unit.
Descriptor::~Descriptor() = default;
/**
 * Validates the tensor descriptors of a subtraction and builds its Kunlun descriptor.
 *
 * @param handle_        Opaque handle, reinterpreted as device::kunlun::Handle.
 * @param desc_ptr       Out-parameter for the new descriptor. It is not touched
 *                       directly in this body; presumably it is assigned inside
 *                       CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR - confirm.
 * @param out_desc       Output tensor descriptor; provides the dtype and the reference shape.
 * @param input_desc_vec Input descriptors in the order {a, b}. at() throws
 *                       std::out_of_range if fewer than two entries are supplied.
 * @return INFINI_STATUS_SUCCESS when control reaches the end of the function.
 *         NOTE(review): the CHECK_* / CREATE_* macros presumably return an error
 *         status early on failure - confirm against their definitions.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto *handle = reinterpret_cast<device::kunlun::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    // Fetch both operand descriptors up front (bounds-checked access).
    const auto &lhs_desc = input_desc_vec.at(0);
    const auto &rhs_desc = input_desc_vec.at(1);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = lhs_desc->shape();
    const auto &b_shape = rhs_desc->shape();

    // Only the half, float and bfloat16 element types are accepted.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);

    // Both operands are compared against the output shape.
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // Create the KUNLUN elementwise descriptor.
    CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}
/**
 * Runs the subtraction kernel for the element type stored in this descriptor.
 *
 * @param workspace      Scratch buffer forwarded to the elementwise implementation.
 * @param workspace_size Size of `workspace`; must be at least `_workspace_size`.
 * @param output         Destination buffer, forwarded unchanged.
 * @param inputs         Input buffers, forwarded unchanged.
 * @param stream         Stream handle, forwarded unchanged.
 * @return INFINI_STATUS_INSUFFICIENT_WORKSPACE if the workspace is too small,
 *         INFINI_STATUS_BAD_TENSOR_DTYPE for an unsupported `_dtype`, otherwise
 *         whatever `_device_info->calculate` returns.
 *
 * NOTE(review): the leading template argument 8 is passed through as-is; its
 * meaning is defined by `_device_info`'s calculate template - confirm.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    // Dispatch on the element type; each supported type returns immediately.
    if (_dtype == INFINI_DTYPE_F16) {
        return _device_info->calculate<8, SubOp, half>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_BF16) {
        return _device_info->calculate<8, SubOp, bfloat16_t>(_info, workspace, output, inputs, stream);
    }
    if (_dtype == INFINI_DTYPE_F32) {
        return _device_info->calculate<8, SubOp, float>(_info, workspace, output, inputs, stream);
    }

    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::sub::kunlun
src/infiniop/ops/sub/operator.cc
View file @
f8e9d924
...
...
@@ -11,6 +11,9 @@
#ifdef ENABLE_METAX_API
#include "metax/sub_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/sub_kunlun.h"
#endif
__C
infiniStatus_t
infiniopCreateSubDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -42,6 +45,9 @@ __C infiniStatus_t infiniopCreateSubDescriptor(
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -70,6 +76,10 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
...
...
@@ -106,6 +116,9 @@ __C infiniStatus_t infiniopSub(
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -136,6 +149,9 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) {
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment