Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
9b32b4b1
Commit
9b32b4b1
authored
Jun 04, 2025
by
Catheriany
Browse files
Merge remote-tracking branch 'origin/main' into issue/150
parents
15bcbdfc
4799ddbf
Changes
103
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
570 additions
and
48 deletions
+570
-48
src/infiniop/ops/swiglu/ascend/swiglu_ascend.h
src/infiniop/ops/swiglu/ascend/swiglu_ascend.h
+79
-0
src/infiniop/ops/swiglu/ascend/swiglu_ascend_kernel.cpp
src/infiniop/ops/swiglu/ascend/swiglu_ascend_kernel.cpp
+180
-0
src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.cc
src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.cc
+63
-0
src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.h
src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.h
+8
-0
src/infiniop/ops/swiglu/kunlun/swiglu_kunlun_internal.xpu
src/infiniop/ops/swiglu/kunlun/swiglu_kunlun_internal.xpu
+33
-0
src/infiniop/ops/swiglu/maca/swiglu_maca.h
src/infiniop/ops/swiglu/maca/swiglu_maca.h
+8
-0
src/infiniop/ops/swiglu/maca/swiglu_maca.maca
src/infiniop/ops/swiglu/maca/swiglu_maca.maca
+56
-0
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
+40
-0
src/infiniop/ops/swiglu/operator.cc
src/infiniop/ops/swiglu/operator.cc
+40
-12
src/infiniop/reduce/cuda/reduce.cuh
src/infiniop/reduce/cuda/reduce.cuh
+1
-1
src/infiniop/reduce/kunlun/reduce_kunlun.h
src/infiniop/reduce/kunlun/reduce_kunlun.h
+3
-1
src/infiniop/tensor.h
src/infiniop/tensor.h
+13
-3
src/infiniop/tensor_descriptor.cc
src/infiniop/tensor_descriptor.cc
+16
-22
src/infinirt/bang/infinirt_bang.cc
src/infinirt/bang/infinirt_bang.cc
+6
-5
src/infinirt/bang/infinirt_bang.h
src/infinirt/bang/infinirt_bang.h
+1
-1
src/infinirt/infinirt.cc
src/infinirt/infinirt.cc
+5
-1
src/utils.h
src/utils.h
+8
-0
src/utils/custom_types.cc
src/utils/custom_types.cc
+1
-1
test/infiniop-test/test_generate/__init__.py
test/infiniop-test/test_generate/__init__.py
+1
-1
test/infiniop-test/test_generate/infiniop_test.py
test/infiniop-test/test_generate/infiniop_test.py
+8
-0
No files found.
src/infiniop/ops/swiglu/ascend/swiglu_ascend.h
0 → 100644
View file @
9b32b4b1
#ifndef __ACLNN_SWIGLU_H__
#define __ACLNN_SWIGLU_H__
#include "../../../../utils.h"
#include "../../../../utils/check.h"
#include "../../../operator.h"
#include "../../../tensor.h"
namespace
op
::
swiglu
::
ascend
{
// Validated shape/stride/dtype metadata for one SwiGLU launch on Ascend.
// Instances are only produced by the create() factory below, which performs
// all descriptor validation.
class SwigluInfo {
    // Private: force construction through create().
    SwigluInfo() = default;

public:
    infiniDtype_t dtype;              // element type shared by c, a and b
    std::vector<size_t> shape;        // common shape of all three tensors
    int32_t ndim;                     // number of dimensions (2 or 3)
    std::vector<ptrdiff_t> c_strides; // strides of the output tensor c
    std::vector<ptrdiff_t> a_strides; // strides of input a
    std::vector<ptrdiff_t> b_strides; // strides of input b

    // Validate the three tensor descriptors and package their metadata.
    // Enforced below: non-null descriptors; no broadcast dimension on the
    // output; matching 2-D or 3-D shapes; contiguous innermost dimension
    // (stride == 1) on all three tensors; a single common dtype.
    // Returns the corresponding INFINI_STATUS_* on any violation.
    static utils::Result<SwigluInfo> create(
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t a_desc,
        infiniopTensorDescriptor_t b_desc) {
        CHECK_OR_RETURN(c_desc && a_desc && b_desc, INFINI_STATUS_BAD_PARAM);
        CHECK_OR_RETURN(!c_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
        CHECK_OR_RETURN(c_desc->ndim() == a_desc->ndim()
                            && c_desc->ndim() == b_desc->ndim()
                            && (c_desc->ndim() == 2 || c_desc->ndim() == 3),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_SAME_SHAPE(c_desc->shape(), a_desc->shape(), b_desc->shape());
        int32_t ndim = c_desc->ndim();
        // The kernel assumes the last dimension is contiguous for all tensors.
        CHECK_OR_RETURN(c_desc->stride(ndim - 1) == 1
                            && a_desc->stride(ndim - 1) == 1
                            && b_desc->stride(ndim - 1) == 1,
                        INFINI_STATUS_BAD_TENSOR_STRIDES);
        CHECK_OR_RETURN(c_desc->dtype() == a_desc->dtype()
                            && c_desc->dtype() == b_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);
        return utils::Result<SwigluInfo>(SwigluInfo{
            c_desc->dtype(),
            c_desc->shape(),
            ndim,
            c_desc->strides(),
            a_desc->strides(),
            b_desc->strides(),
        });
    }
};
// Operator descriptor for SwiGLU on Ascend. Holds the validated tensor
// metadata and the workspace requirement; built via the static create()
// factory (implementation outside this header).
class Descriptor final : public InfiniopDescriptor {
    SwigluInfo _info;       // validated shapes/strides/dtype
    size_t _workspace_size; // bytes of device workspace required by calculate()

    // Private: instances are produced by create().
    Descriptor(SwigluInfo info, size_t workspace_size,
               infiniDevice_t device_type, int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _info(info),
          _workspace_size(workspace_size) {}

public:
    ~Descriptor();

    // Validate descriptors and allocate a new Descriptor into *desc_ptr.
    // input_descs carries the two SwiGLU inputs.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t c_desc,
        std::vector<infiniopTensorDescriptor_t> input_descs);

    // Workspace bytes the caller must provide to calculate().
    size_t workspaceSize() const { return _workspace_size; }

    // Launch the SwiGLU computation: c = SwiGLU(inputs...) on `stream`.
    infiniStatus_t calculate(
        void *workspace, size_t workspace_size,
        void *c,
        std::vector<const void *> inputs,
        void *stream) const;
};
// C-linkage bridge to the device kernel launcher compiled in
// swiglu_ascend_kernel.cpp. Dispatches on `dtype`; strides are given
// separately for the batch and sequence dimensions of c, a and b.
extern "C" infiniStatus_t swiglu_kernel_launch(
    void *c, void *a, void *b,
    infiniDtype_t dtype,
    size_t batch, size_t seq, size_t hd,
    ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
    ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b,
    void *stream);

} // namespace op::swiglu::ascend

#endif // __ACLNN_SWIGLU_H__
src/infiniop/ops/swiglu/ascend/swiglu_ascend_kernel.cpp
0 → 100644
View file @
9b32b4b1
#include "../../../devices/ascend/ascend_kernel_common.h"
using
namespace
AscendC
;
// Per-core SwiGLU pipeline: streams (batch, seq) rows through VECIN/VECOUT
// queues and computes c = SwiGLU(a, b) over this core's slice of the hidden
// dimension.
template <typename T>
class SwigluKernel {
public:
    __aicore__ inline SwigluKernel() {}
    // Record shapes/strides, bind the global-memory buffers, and allocate
    // queue storage through the TPipe.
    __aicore__ inline void init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
                                size_t batch_, size_t seq, size_t hd,
                                ptrdiff_t stride_batch_c,
                                ptrdiff_t stride_batch_a,
                                ptrdiff_t stride_batch_b,
                                ptrdiff_t stride_seq_c,
                                ptrdiff_t stride_seq_a,
                                ptrdiff_t stride_seq_b);
    // Run copyIn -> compute -> copyOut for every (batch, seq) row.
    __aicore__ inline void process();

private:
    __aicore__ inline void copyIn(size_t i);  // stage row i of a and b into local memory
    __aicore__ inline void compute(size_t i); // apply the SwiGLU primitive to the staged tiles
    __aicore__ inline void copyOut(size_t i); // write row i of c back to global memory

private:
    GlobalTensor<T> _c_gm, _a_gm, _b_gm; // global-memory views of c, a, b
    TQue<QuePosition::VECIN, BUFFER_NUM> _in_queue_a, _in_queue_b;
    TQue<QuePosition::VECOUT, BUFFER_NUM> _out_queue_c;
    TPipe _pipe;
    float _beta_value = 1.0f; // beta argument forwarded to the AscendC SwiGLU call
    size_t _block_idx, _tile_len, _copy_len,
        _batch, _seq_len, _hidden_size,
        // NOTE(review): the seq strides arrive as ptrdiff_t but are stored as
        // size_t — safe only for non-negative strides; confirm callers.
        _stride_seq_a, _stride_seq_b, _stride_seq_c;
    int64_t _stride_batch_a = 1, _stride_batch_b = 1, _stride_batch_c = 1;
};
// Cache the shape/stride arguments, compute this core's tile of the hidden
// dimension, bind global buffers, and allocate double-buffered queue storage.
template <typename T>
__aicore__ inline void SwigluKernel<T>::init(
    GM_ADDR c, GM_ADDR a, GM_ADDR b,
    size_t batch_, size_t seq, size_t hd,
    ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
    ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b) {
    // Init shape & stride variables
    _batch = batch_;
    _seq_len = seq;
    _hidden_size = hd;
    _stride_batch_a = stride_batch_a;
    _stride_batch_b = stride_batch_b;
    _stride_batch_c = stride_batch_c;
    _stride_seq_a = stride_seq_a;
    _stride_seq_b = stride_seq_b;
    _stride_seq_c = stride_seq_c;
    _block_idx = GetBlockIdx();
    // Split the hidden dimension across BLOCK_NUM cores; the first
    // (_hidden_size % BLOCK_NUM) cores take one extra element each.
    _tile_len = _block_idx < (_hidden_size % BLOCK_NUM)
                    ? (_hidden_size / BLOCK_NUM) + 1
                    : (_hidden_size / BLOCK_NUM);
    // Round the per-copy length up to the byte-alignment requirement.
    _copy_len = alignTileLen<T>(_tile_len, BYTE_ALIGN);
    // Set global tensors
    _a_gm.SetGlobalBuffer((__gm__ T *)a);
    _b_gm.SetGlobalBuffer((__gm__ T *)b);
    _c_gm.SetGlobalBuffer((__gm__ T *)c);
    // _pipe allocates memory to each queue; the unit is bytes.
    _pipe.InitBuffer(_in_queue_a, BUFFER_NUM, _copy_len * sizeof(T));
    _pipe.InitBuffer(_in_queue_b, BUFFER_NUM, _copy_len * sizeof(T));
    _pipe.InitBuffer(_out_queue_c, BUFFER_NUM, _copy_len * sizeof(T));
}
// Stage tile i of inputs a and b from global memory into the VECIN queues.
// i enumerates (batch, seq) rows; this core copies its _tile_len slice of
// the hidden dimension (rounded up to _copy_len for alignment).
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyIn(size_t i) {
    // Alloc tensors from queue memory
    LocalTensor<T> aLocal = _in_queue_a.AllocTensor<T>();
    LocalTensor<T> bLocal = _in_queue_b.AllocTensor<T>();
    // Decompose i into (batch_idx, seq_idx); a 1-batch layout treats i as the
    // sequence index directly.
    auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
    auto seq_idx = _batch == 1 ? i : i % _seq_len;
    // Global-memory element offsets for this core's slice of row i.
    ptrdiff_t idxa = batch_idx * _stride_batch_a + seq_idx * _stride_seq_a
                   + _block_idx * _tile_len;
    ptrdiff_t idxb = batch_idx * _stride_batch_b + seq_idx * _stride_seq_b
                   + _block_idx * _tile_len;
    // Copy the i-th tile from global tensors to local tensors
    DataCopy(aLocal, _a_gm[idxa], _copy_len);
    DataCopy(bLocal, _b_gm[idxb], _copy_len);
    // Enque input tensors to the VECIN queues
    _in_queue_a.EnQue(aLocal);
    _in_queue_b.EnQue(bLocal);
}
// Pop the staged input tiles, run the AscendC SwiGLU primitive into a fresh
// output tile, queue the result, and release the inputs.
template <typename T>
__aicore__ inline void SwigluKernel<T>::compute(size_t i) {
    // Deque input tensors from the VECIN queues
    LocalTensor<T> aLocal = _in_queue_a.DeQue<T>();
    LocalTensor<T> bLocal = _in_queue_b.DeQue<T>();
    LocalTensor<T> cLocal = _out_queue_c.AllocTensor<T>();
    // Call the SwiGLU Ascend API over _copy_len elements with beta = _beta_value.
    SwiGLU<T, false>(cLocal, aLocal, bLocal, _beta_value, _copy_len);
    // Enque the result and free the inputs
    _out_queue_c.EnQue<T>(cLocal);
    _in_queue_a.FreeTensor(aLocal);
    _in_queue_b.FreeTensor(bLocal);
}
// Write tile i of the output c back to global memory. Only _tile_len
// elements are written (not the padded _copy_len); an unaligned byte count
// takes the DataCopyPad path.
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyOut(size_t i) {
    // Deque the output tensor from the VECOUT queue
    LocalTensor<T> cLocal = _out_queue_c.DeQue<T>();
    auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
    auto seq_idx = _batch == 1 ? i : i % _seq_len;
    ptrdiff_t idxc = batch_idx * _stride_batch_c + seq_idx * _stride_seq_c
                   + _block_idx * _tile_len;
    // Copy the i-th tile from local back to global memory.
    if (_tile_len * sizeof(T) % BYTE_ALIGN != 0) {
        // Unaligned tail: padded copy of exactly _tile_len * sizeof(T) bytes.
        DataCopyExtParams dcep = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
        DataCopyPad(_c_gm[idxc], cLocal, dcep);
    } else {
        DataCopy(_c_gm[idxc], cLocal, _tile_len);
    }
    // Free the output local tensor
    _out_queue_c.FreeTensor(cLocal);
}
// Drive the full pipeline: one copyIn/compute/copyOut round trip per
// (batch, seq) row handled by this core.
template <typename T>
__aicore__ inline void SwigluKernel<T>::process() {
    const size_t total_rows = _batch * _seq_len;
    size_t row = 0;
    while (row < total_rows) {
        copyIn(row);
        compute(row);
        copyOut(row);
        ++row;
    }
}
// Stamp out a __global__ kernel entry point named KERNEL_NAME that drives the
// SwigluKernel<TYPE> pipeline: init with the forwarded launch arguments, then
// process all rows.
#define DEFINE_SWIGLU_KERNEL(KERNEL_NAME, TYPE) \
    __global__ __aicore__ void KERNEL_NAME(GM_ADDR c, GM_ADDR a, GM_ADDR b, \
                                           size_t batch, size_t seq, size_t hd, \
                                           ptrdiff_t stride_batch_c, \
                                           ptrdiff_t stride_batch_a, \
                                           ptrdiff_t stride_batch_b, \
                                           ptrdiff_t stride_seq_c, \
                                           ptrdiff_t stride_seq_a, \
                                           ptrdiff_t stride_seq_b) { \
        SwigluKernel<TYPE> op; \
        op.init(c, a, b, \
                batch, seq, hd, \
                stride_batch_c, stride_batch_a, stride_batch_b, \
                stride_seq_c, stride_seq_a, stride_seq_b); \
        op.process(); \
    }

// Concrete kernels for the dtypes dispatched by swiglu_kernel_launch below.
DEFINE_SWIGLU_KERNEL(swiglu_kernel_half, half)
DEFINE_SWIGLU_KERNEL(swiglu_kernel_float, float)

#undef DEFINE_SWIGLU_KERNEL
// Host-side C entry point: select and launch the SwiGLU kernel matching
// `dtype` on `stream`. Returns INFINI_STATUS_BAD_TENSOR_DTYPE for any dtype
// without a kernel instantiation.
extern "C" infiniStatus_t swiglu_kernel_launch(
    void *c, void *a, void *b,
    infiniDtype_t dtype,
    size_t batch, size_t seq, size_t hd,
    ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
    ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b,
    void *stream) {
// Expands to one `case` per dtype: launch on BLOCK_NUM cores and return.
#define LAUNCH_SWIGLU_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
    case DTYPE_ENUM: \
        KERNEL_NAME<<<BLOCK_NUM, nullptr, stream>>>( \
            c, a, b, \
            batch, \
            seq, \
            hd, \
            stride_batch_c, stride_batch_a, stride_batch_b, \
            stride_seq_c, stride_seq_a, stride_seq_b); \
        return INFINI_STATUS_SUCCESS;

    switch (dtype) {
        LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F16, swiglu_kernel_half)
        LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F32, swiglu_kernel_float)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_SWIGLU_KERNEL
}
src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.cc
0 → 100644
View file @
9b32b4b1
#include "swiglu_kunlun.h"
// Op interface declare
LAUNCH_ELEMENTWISE_KERNEL
(
SwiGLU
)
namespace op::swiglu::kunlun {

// Adapter binding the shared elementwise framework to the Kunlun SwiGLU
// kernel declared by LAUNCH_ELEMENTWISE_KERNEL above.
typedef struct SwiGLUOp {
    // SwiGLU consumes two operands: up and gate.
    static constexpr size_t num_inputs = 2;
    // Forward all launch arguments to the device kernel for element type Tdata.
    template <typename Tdata, typename... Args>
    static infiniStatus_t launch(Args... args) {
        launchSwiGLUKernel<Tdata>(args...);
        return INFINI_STATUS_SUCCESS;
    }
} SwiGLUOp;

Descriptor::~Descriptor() = default;

// Validate the descriptors and build a Kunlun elementwise descriptor for
// SwiGLU. input_desc_vec holds {up, gate}; all three shapes must match and
// only F32 is supported on this backend.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::kunlun::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &up_desc = input_desc_vec.at(0);
    const auto &gate_desc = input_desc_vec.at(1);
    const auto &out_shape = out_desc->shape();
    const auto &up_shape = up_desc->shape();
    const auto &gate_shape = gate_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
    CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);

    // create KUNLUN elementwise descriptor (writes *desc_ptr and returns on error)
    CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Check the workspace and dispatch the elementwise SwiGLU computation by
// dtype. The trailing return after the switch was unreachable (every path
// returns) and has been removed.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F32:
        return _device_info->calculate<SwiGLUOp, float>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::swiglu::kunlun
src/infiniop/ops/swiglu/kunlun/swiglu_kunlun.h
0 → 100644
View file @
9b32b4b1
#ifndef __SWIGLU_KUNLUN_H__
#define __SWIGLU_KUNLUN_H__

#include "../../../elementwise/kunlun/elementwise_kunlun.h"

// Declare op::swiglu::kunlun::Descriptor through the shared elementwise
// descriptor macro; the implementation lives in swiglu_kunlun.cc.
ELEMENTWISE_DESCRIPTOR(swiglu, kunlun)

#endif // __SWIGLU_KUNLUN_H__
src/infiniop/ops/swiglu/kunlun/swiglu_kunlun_internal.xpu
0 → 100644
View file @
9b32b4b1
// Include guard renamed: this file previously reused __SWIGLU_KUNLUN_H__,
// the exact guard of swiglu_kunlun.h, so a translation unit including both
// would silently skip whichever came second.
#ifndef __SWIGLU_KUNLUN_INTERNAL_H__
#define __SWIGLU_KUNLUN_INTERNAL_H__

#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "../../../elementwise/kunlun/elementwise_kunlun_kernel.h"

/// @brief Define swiglu op for local mem
typedef struct SwiGLUOp {
private:
    // Logistic function 1 / (1 + e^-x), evaluated via float literals.
    template <typename T>
    inline __device__ T sigmoid(T x) const {
        return 1.0f / (1.0f + exp(-x));
    }

public:
    // This static number must be set in other Ops
    static constexpr size_t num_inputs = 2;

    // inputs[0] = up, inputs[1] = gate; returns gate * sigmoid(gate) * up.
    template <typename T>
    inline __device__ T operator()(const T *inputs) const {
        T up = inputs[0];
        T gate = inputs[1];
        T out = gate * sigmoid(gate) * up;
        return out;
    }
} SwiGLUOp;

// Definition for swiglu kernel interface
LAUNCH_ELEMENTWISE_KERNEL_IMPL(SwiGLU, SwiGLUOp)

// Template instantiate
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(SwiGLU, float)

#endif // __SWIGLU_KUNLUN_INTERNAL_H__
src/infiniop/ops/swiglu/maca/swiglu_maca.h
0 → 100644
View file @
9b32b4b1
#ifndef __SWIGLU_MACA_API_H__
#define __SWIGLU_MACA_API_H__

#include "../../../elementwise/maca/elementwise_maca_api.h"

// Declare op::swiglu::maca::Descriptor through the shared elementwise
// descriptor macro; the implementation lives in swiglu_maca.maca.
ELEMENTWISE_DESCRIPTOR(swiglu, maca)

#endif // __SWIGLU_MACA_API_H__
src/infiniop/ops/swiglu/maca/swiglu_maca.maca
0 → 100644
View file @
9b32b4b1
#include "swiglu_maca.h"
#include "swiglu_maca_internal.h"
namespace op::swiglu::maca {

Descriptor::~Descriptor() = default;

// Validate the descriptors and build a MACA elementwise descriptor for
// SwiGLU. input_desc_vec holds {up, gate}; all three shapes must match and
// dtype must be one of F16/F32/F64.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::maca::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &up_desc = input_desc_vec.at(0);
    const auto &gate_desc = input_desc_vec.at(1);
    const auto &out_shape = out_desc->shape();
    const auto &up_shape = up_desc->shape();
    const auto &gate_shape = gate_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);

    // create MACA elementwise descriptor (writes *desc_ptr and returns on error)
    CREATE_ELEMENTWISE_MACA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Check the workspace and dispatch the elementwise SwiGLU computation by
// dtype using 256-thread blocks. The trailing return after the switch was
// unreachable (every path returns) and has been removed.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::swiglu::maca
src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
0 → 100644
View file @
9b32b4b1
#ifndef __SWIGLU_MACA_H__
#define __SWIGLU_MACA_H__

#include "../../../elementwise/maca/elementwise_maca.h"
#include <hctlass/half.h>

namespace op::swiglu::maca {

// Elementwise SwiGLU functor: out = gate * sigmoid(gate) * up.
// Dispatched via `if constexpr` to half2 / half / float intrinsics, with a
// generic std::exp fallback for any other type.
typedef struct SwiGLUOp {
private:
    // sigmoid(x) = 1 / (1 + e^-x), using the fastest form for each type.
    template <typename T>
    __device__ __forceinline__ T sigmoid(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed two-lane half: per-lane reciprocal of (1 + exp(-x)).
            return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
        } else if constexpr (std::is_same_v<T, half>) {
            // exp is evaluated in float precision and converted back to half.
            return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
        } else if constexpr (std::is_same_v<T, float>) {
            return __frcp_rn(__fadd_rn(1, __expf(-x)));
        } else {
            return 1 / (1 + std::exp(-x));
        }
    }

public:
    // Number of input operands consumed by the elementwise framework.
    static constexpr size_t num_inputs = 2;

    // Combine one up/gate element pair (or a half2 lane pair).
    template <typename T>
    __device__ __forceinline__ T operator()(const T &up, const T &gate) const {
        if constexpr (std::is_same_v<T, half2>) {
            return __hmul2(__hmul2(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmul(__hmul(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, float>) {
            return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
        } else {
            return gate * sigmoid(gate) * up;
        }
    }
} SwiGLUOp;

} // namespace op::swiglu::maca

#endif
src/infiniop/ops/swiglu/operator.cc
View file @
9b32b4b1
...
@@ -8,6 +8,15 @@
...
@@ -8,6 +8,15 @@
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
#include "cuda/swiglu_cuda.cuh"
#include "cuda/swiglu_cuda.cuh"
#endif
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif
#ifdef ENABLE_METAX_API
#include "maca/swiglu_maca.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
#endif
__C
infiniStatus_t
infiniopCreateSwiGLUDescriptor
(
__C
infiniStatus_t
infiniopCreateSwiGLUDescriptor
(
infiniopHandle_t
handle
,
infiniopHandle_t
handle
,
...
@@ -33,6 +42,12 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
...
@@ -33,6 +42,12 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
#endif
#ifdef ENABLE_KUNLUN_API
CREATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangCreateSwiGLUDescriptor
((
BangHandle_t
)
handle
,
return
bangCreateSwiGLUDescriptor
((
BangHandle_t
)
handle
,
...
@@ -40,11 +55,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
...
@@ -40,11 +55,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
c_desc
,
a_desc
,
b_desc
);
c_desc
,
a_desc
,
b_desc
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_API
case
DevAscendNpu
:
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
return
ascendCreateSwiGLUDescriptor
(
(
AscendHandle_t
)
handle
,
(
SwiGLUAscendDescriptor_t
*
)
desc_ptr
,
c_desc
,
a_desc
,
b_desc
);
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
case
DevMetaxGpu
:
{
...
@@ -80,12 +92,18 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
...
@@ -80,12 +92,18 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
#endif
#ifdef ENABLE_KUNLUN_API
GET
(
INFINI_DEVICE_KUNLUN
,
kunlun
)
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangGetSwiGLUWorkspaceSize
((
SwiGLUBangDescriptor_t
)
desc
,
size
);
return
bangGetSwiGLUWorkspaceSize
((
SwiGLUBangDescriptor_t
)
desc
,
size
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_
NPU
#ifdef ENABLE_ASCEND_
API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
GET
(
INFINI_DEVICE_ASCEND
,
ascend
)
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
...
@@ -127,14 +145,19 @@ __C infiniStatus_t infiniopSwiGLU(
...
@@ -127,14 +145,19 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangSwiGLU
((
SwiGLUBangDescriptor_t
)
desc
,
c
,
a
,
b
,
stream
);
return
bangSwiGLU
((
SwiGLUBangDescriptor_t
)
desc
,
c
,
a
,
b
,
stream
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_API
case
DevAscendNpu
:
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
return
ascendSwiGLU
((
SwiGLUAscendDescriptor_t
)
desc
,
c
,
a
,
b
,
stream
);
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
case
DevMetaxGpu
:
...
@@ -168,14 +191,19 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
...
@@ -168,14 +191,19 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_CUDA_API
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
#endif
#ifdef ENABLE_KUNLUN_API
DELETE
(
INFINI_DEVICE_KUNLUN
,
kunlun
);
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_MLU
case
DevCambriconMlu
:
{
case
DevCambriconMlu
:
{
return
bangDestroySwiGLUDescriptor
((
SwiGLUBangDescriptor_t
)
desc
);
return
bangDestroySwiGLUDescriptor
((
SwiGLUBangDescriptor_t
)
desc
);
}
}
#endif
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_API
case
DevAscendNpu
:
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
)
return
ascendDestroySwiGLUDescriptor
((
SwiGLUAscendDescriptor_t
)
desc
);
#endif
#endif
#ifdef ENABLE_METAX_GPU
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
case
DevMetaxGpu
:
...
...
src/infiniop/reduce/cuda/reduce.cuh
View file @
9b32b4b1
...
@@ -18,7 +18,7 @@ __device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t cou
...
@@ -18,7 +18,7 @@ __device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t cou
// Each thread computes its partial sum
// Each thread computes its partial sum
for
(
size_t
i
=
threadIdx
.
x
;
i
<
count
;
i
+=
BLOCK_SIZE
)
{
for
(
size_t
i
=
threadIdx
.
x
;
i
<
count
;
i
+=
BLOCK_SIZE
)
{
ss
+=
Tcompute
(
data_ptr
[
i
]
*
data_ptr
[
i
]);
ss
+=
Tcompute
(
data_ptr
[
i
]
)
*
Tcompute
(
data_ptr
[
i
]);
}
}
// Use CUB block-level reduction
// Use CUB block-level reduction
...
...
src/infiniop/reduce/kunlun/reduce_kunlun.h
View file @
9b32b4b1
#ifndef __INFINIOP_REDUCE_KUNLUN_H__
#ifndef __INFINIOP_REDUCE_KUNLUN_H__
#define __INFINIOP_REDUCE_KUNLUN_H__
#define __INFINIOP_REDUCE_KUNLUN_H__
#include "../../devices/kunlun/kunlun_common.h"
#include "../../devices/kunlun/kunlun_
kernel_
common.h"
namespace
op
::
common_kunlun
::
reduce_op
{
namespace
op
::
common_kunlun
::
reduce_op
{
using
namespace
device
::
kunlun
::
kernel
;
// Use 16 floats instruction to calculate reduce
// Use 16 floats instruction to calculate reduce
// data_ptr is the pointer of LM
// data_ptr is the pointer of LM
static
inline
__device__
float
sumSquaredF32
(
float
*
data_ptr
,
int
count
)
{
static
inline
__device__
float
sumSquaredF32
(
float
*
data_ptr
,
int
count
)
{
...
...
src/infiniop/tensor.h
View file @
9b32b4b1
...
@@ -2,9 +2,19 @@
...
@@ -2,9 +2,19 @@
#define __INFINIOP_TENSOR_H__
#define __INFINIOP_TENSOR_H__
#include "infiniop/tensor_descriptor.h"
#include "infiniop/tensor_descriptor.h"
#include "../utils.h"
#include <string>
#include <string>
#include <vector>
#include <vector>
#define TRANSFORM_TENSOR_DESC(__TENSOR_DESC__, __OP__) \
do { \
auto __RESULT__ = __TENSOR_DESC__->__OP__; \
CHECK_RESULT(__RESULT__); \
__TENSOR_DESC__ = __RESULT__.take(); \
} while (0)
struct
InfiniopTensorDescriptor
{
struct
InfiniopTensorDescriptor
{
private:
private:
// Datatype
// Datatype
...
@@ -32,9 +42,9 @@ public:
...
@@ -32,9 +42,9 @@ public:
bool
hasBroadcastDim
()
const
;
bool
hasBroadcastDim
()
const
;
std
::
vector
<
size_t
>
getBroadcastDim
()
const
;
std
::
vector
<
size_t
>
getBroadcastDim
()
const
;
infiniopTensorDescriptor_t
dimMerge
(
size_t
dim_start
,
size_t
dim_end
)
const
;
utils
::
Result
<
infiniopTensorDescriptor_t
>
dimMerge
(
size_t
dim_start
,
size_t
dim_end
)
const
;
infiniopTensorDescriptor_t
dimSplit
(
size_t
axis
,
const
std
::
vector
<
size_t
>
&
dims
)
const
;
utils
::
Result
<
infiniopTensorDescriptor_t
>
dimSplit
(
size_t
axis
,
const
std
::
vector
<
size_t
>
&
dims
)
const
;
infiniopTensorDescriptor_t
dimPermute
(
const
std
::
vector
<
size_t
>
&
order
)
const
;
utils
::
Result
<
infiniopTensorDescriptor_t
>
dimPermute
(
const
std
::
vector
<
size_t
>
&
order
)
const
;
std
::
string
toString
()
const
;
std
::
string
toString
()
const
;
};
};
...
...
src/infiniop/tensor_descriptor.cc
View file @
9b32b4b1
...
@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip
...
@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip
std
::
vector
<
ptrdiff_t
>
strides
(
ndim
);
std
::
vector
<
ptrdiff_t
>
strides
(
ndim
);
ptrdiff_t
dsize
=
1
;
ptrdiff_t
dsize
=
1
;
if
(
ndim
>
0
)
{
if
(
ndim
>
0
)
{
for
(
size_
t
i
=
ndim
-
1
;
i
>=
0
;
i
--
)
{
for
(
in
t
i
=
(
int
)
ndim
-
1
;
i
>=
0
;
i
--
)
{
strides
[
i
]
=
dsize
;
strides
[
i
]
=
dsize
;
dsize
*=
shape_
[
i
];
dsize
*=
shape_
[
i
];
}
}
...
@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
...
@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
return
res
;
return
res
;
}
}
infiniopTensorDescriptor_t
InfiniopTensorDescriptor
::
dimMerge
(
size_t
dim_start
,
size_t
dim_end
)
const
{
utils
::
Result
<
infiniopTensorDescriptor_t
>
InfiniopTensorDescriptor
::
dimMerge
(
size_t
dim_start
,
size_t
dim_end
)
const
{
if
(
dim_start
>
dim_end
||
dim_end
>=
ndim
())
{
CHECK_OR_RETURN
(
dim_start
<=
dim_end
&&
dim_end
<
ndim
(),
INFINI_STATUS_BAD_PARAM
);
return
nullptr
;
}
size_t
new_ndim
=
ndim
()
-
(
dim_end
-
dim_start
);
size_t
new_ndim
=
ndim
()
-
(
dim_end
-
dim_start
);
std
::
vector
<
size_t
>
new_shape
(
new_ndim
);
std
::
vector
<
size_t
>
new_shape
(
new_ndim
);
...
@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
...
@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index
++
;
index
++
;
}
}
if
(
!
isContiguous
(
dim_start
,
dim_end
))
{
CHECK_OR_RETURN
(
isContiguous
(
dim_start
,
dim_end
),
INFINI_STATUS_BAD_PARAM
);
return
nullptr
;
}
new_shape
[
index
]
=
1
;
new_shape
[
index
]
=
1
;
for
(
size_t
i
=
dim_start
;
i
<=
dim_end
;
i
++
)
{
for
(
size_t
i
=
dim_start
;
i
<=
dim_end
;
i
++
)
{
...
@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
...
@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index
++
;
index
++
;
}
}
return
new
InfiniopTensorDescriptor
(
_dtype
,
new_ndim
,
new_shape
.
data
(),
new_strides
.
data
());
return
utils
::
Result
<
infiniopTensorDescriptor_t
>
(
new
InfiniopTensorDescriptor
(
_dtype
,
new_ndim
,
new_shape
.
data
(),
new_strides
.
data
()));
}
}
infiniopTensorDescriptor_t
InfiniopTensorDescriptor
::
dimSplit
(
size_t
axis
,
const
std
::
vector
<
size_t
>
&
dims
)
const
{
utils
::
Result
<
infiniopTensorDescriptor_t
>
InfiniopTensorDescriptor
::
dimSplit
(
size_t
axis
,
const
std
::
vector
<
size_t
>
&
dims
)
const
{
size_t
ndim_
=
ndim
();
size_t
ndim_
=
ndim
();
if
(
dim
(
axis
)
!=
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
()))
{
CHECK_OR_RETURN
(
dim
(
axis
)
==
std
::
accumulate
(
dims
.
begin
(),
dims
.
end
(),
(
size_t
)
1
,
std
::
multiplies
<
size_t
>
()),
return
nullptr
;
INFINI_STATUS_BAD_PARAM
);
}
size_t
new_ndim
=
ndim_
+
dims
.
size
()
-
1
;
size_t
new_ndim
=
ndim_
+
dims
.
size
()
-
1
;
std
::
vector
<
size_t
>
new_shape
(
new_ndim
);
std
::
vector
<
size_t
>
new_shape
(
new_ndim
);
...
@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const
...
@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const
index
++
;
index
++
;
}
}
return
new
InfiniopTensorDescriptor
(
_dtype
,
new_ndim
,
new_shape
.
data
(),
new_strides
.
data
());
return
utils
::
Result
<
infiniopTensorDescriptor_t
>
(
new
InfiniopTensorDescriptor
(
_dtype
,
new_ndim
,
new_shape
.
data
(),
new_strides
.
data
()));
}
}
infiniopTensorDescriptor_t
InfiniopTensorDescriptor
::
dimPermute
(
const
std
::
vector
<
size_t
>
&
order
)
const
{
utils
::
Result
<
infiniopTensorDescriptor_t
>
InfiniopTensorDescriptor
::
dimPermute
(
const
std
::
vector
<
size_t
>
&
order
)
const
{
auto
ndim_
=
ndim
();
auto
ndim_
=
ndim
();
if
(
order
.
size
()
!=
ndim_
)
{
CHECK_OR_RETURN
(
order
.
size
()
==
ndim_
,
INFINI_STATUS_BAD_PARAM
);
return
nullptr
;
}
std
::
vector
<
size_t
>
new_shape
(
ndim_
);
std
::
vector
<
size_t
>
new_shape
(
ndim_
);
std
::
vector
<
ptrdiff_t
>
new_strides
(
ndim_
);
std
::
vector
<
ptrdiff_t
>
new_strides
(
ndim_
);
for
(
size_t
i
=
0
;
i
<
ndim_
;
i
++
)
{
for
(
size_t
i
=
0
;
i
<
ndim_
;
i
++
)
{
if
(
std
::
find
(
order
.
begin
(),
order
.
end
(),
i
)
==
order
.
end
())
{
CHECK_OR_RETURN
(
std
::
find
(
order
.
begin
(),
order
.
end
(),
i
)
!=
order
.
end
(),
INFINI_STATUS_BAD_PARAM
);
return
nullptr
;
}
new_shape
[
i
]
=
dim
(
order
[
i
]);
new_shape
[
i
]
=
dim
(
order
[
i
]);
new_strides
[
i
]
=
stride
(
order
[
i
]);
new_strides
[
i
]
=
stride
(
order
[
i
]);
}
}
return
new
InfiniopTensorDescriptor
(
_dtype
,
ndim_
,
new_shape
.
data
(),
new_strides
.
data
());
return
utils
::
Result
<
infiniopTensorDescriptor_t
>
(
new
InfiniopTensorDescriptor
(
_dtype
,
ndim_
,
new_shape
.
data
(),
new_strides
.
data
()));
}
}
std
::
string
InfiniopTensorDescriptor
::
toString
()
const
{
std
::
string
InfiniopTensorDescriptor
::
toString
()
const
{
...
...
src/infinirt/bang/infinirt_bang.cc
View file @
9b32b4b1
...
@@ -6,7 +6,8 @@
...
@@ -6,7 +6,8 @@
namespace
infinirt
::
bang
{
namespace
infinirt
::
bang
{
infiniStatus_t
getDeviceCount
(
int
*
count
)
{
infiniStatus_t
getDeviceCount
(
int
*
count
)
{
CHECK_BANGRT
(
cnrtGetDeviceCount
(
count
));
unsigned
int
device_count
=
static_cast
<
unsigned
int
>
(
*
count
);
CHECK_BANGRT
(
cnrtGetDeviceCount
(
&
device_count
));
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
...
@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() {
...
@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() {
infiniStatus_t
streamCreate
(
infinirtStream_t
*
stream_ptr
)
{
infiniStatus_t
streamCreate
(
infinirtStream_t
*
stream_ptr
)
{
cnrtQueue_t
queue
;
cnrtQueue_t
queue
;
CHECK_BANGRT
(
cnrtQueueCreate
(
&
stream
));
CHECK_BANGRT
(
cnrtQueueCreate
(
&
queue
));
*
stream_ptr
=
queue
;
*
stream_ptr
=
queue
;
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
...
@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
...
@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
}
}
infiniStatus_t
eventQuery
(
infinirtEvent_t
event
,
infinirtEventStatus_t
*
status_ptr
)
{
infiniStatus_t
eventQuery
(
infinirtEvent_t
event
,
infinirtEventStatus_t
*
status_ptr
)
{
auto
status
=
cnrtQueryNotifier
((
cnrt
Queue_t
)
stream
);
auto
status
=
cnrtQueryNotifier
((
cnrt
Notifier_t
)
event
);
if
(
status
==
cnrtSuccess
)
{
if
(
status
==
cnrtSuccess
)
{
*
status_ptr
=
INFINIRT_EVENT_COMPLETE
;
*
status_ptr
=
INFINIRT_EVENT_COMPLETE
;
}
else
if
(
status
==
cnrtErrorBusy
)
{
}
else
if
(
status
==
cnrtErrorBusy
)
{
...
@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
...
@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
}
}
infiniStatus_t
memcpy
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
infinirtMemcpyKind_t
kind
)
{
infiniStatus_t
memcpy
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
infinirtMemcpyKind_t
kind
)
{
CHECK_BANGRT
(
cnrtMemcpy
(
dst
,
src
,
size
,
toBangMemcpyKind
(
kind
)));
CHECK_BANGRT
(
cnrtMemcpy
(
dst
,
(
void
*
)
src
,
size
,
toBangMemcpyKind
(
kind
)));
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
infiniStatus_t
memcpyAsync
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
infinirtMemcpyKind_t
kind
,
infinirtStream_t
stream
)
{
infiniStatus_t
memcpyAsync
(
void
*
dst
,
const
void
*
src
,
size_t
size
,
infinirtMemcpyKind_t
kind
,
infinirtStream_t
stream
)
{
CHECK_BANGRT
(
cnrtMemcpyAsync_V2
(
dst
,
src
,
size
,
(
cnrtQueue_t
)
stream
,
toBangMemcpyKind
(
kind
)));
CHECK_BANGRT
(
cnrtMemcpyAsync_V2
(
dst
,
(
void
*
)
src
,
size
,
(
cnrtQueue_t
)
stream
,
toBangMemcpyKind
(
kind
)));
return
INFINI_STATUS_SUCCESS
;
return
INFINI_STATUS_SUCCESS
;
}
}
...
...
src/infinirt/bang/infinirt_bang.h
View file @
9b32b4b1
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#include "../infinirt_impl.h"
#include "../infinirt_impl.h"
namespace
infinirt
::
bang
{
namespace
infinirt
::
bang
{
#ifdef ENABLE_
BANG
_API
#ifdef ENABLE_
CAMBRICON
_API
INFINIRT_DEVICE_API_IMPL
INFINIRT_DEVICE_API_IMPL
#else
#else
INFINIRT_DEVICE_API_NOOP
INFINIRT_DEVICE_API_NOOP
...
...
src/infinirt/infinirt.cc
View file @
9b32b4b1
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
#include "bang/infinirt_bang.h"
#include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h"
#include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh"
#include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "maca/infinirt_maca.h"
#include "maca/infinirt_maca.h"
#include "musa/infinirt_musa.h"
#include "musa/infinirt_musa.h"
...
@@ -66,8 +67,11 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
...
@@ -66,8 +67,11 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_MOORE: \
case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \
_status = infinirt::musa::API PARAMS; \
break; \
break; \
case INFINI_DEVICE_KUNLUN: \
_status = infinirt::kunlun::API PARAMS; \
break; \
default: \
default: \
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
\
_status =
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \
} \
{ ACTION; } \
{ ACTION; } \
return _status; \
return _status; \
...
...
src/utils.h
View file @
9b32b4b1
...
@@ -100,4 +100,12 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) {
...
@@ -100,4 +100,12 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) {
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
namespace
utils
{
inline
size_t
align
(
size_t
size
,
size_t
alignment
)
{
return
(
size
+
alignment
-
1
)
&
~
(
alignment
-
1
);
}
}
// namespace utils
#endif
#endif
src/utils/custom_types.cc
View file @
9b32b4b1
...
@@ -43,7 +43,7 @@ fp16_t _f32_to_f16(float val) {
...
@@ -43,7 +43,7 @@ fp16_t _f32_to_f16(float val) {
int32_t
exponent
=
((
f32
>>
23
)
&
0xFF
)
-
127
;
// Extract and de-bias the exponent
int32_t
exponent
=
((
f32
>>
23
)
&
0xFF
)
-
127
;
// Extract and de-bias the exponent
uint32_t
mantissa
=
f32
&
0x7FFFFF
;
// Extract the mantissa (fraction part)
uint32_t
mantissa
=
f32
&
0x7FFFFF
;
// Extract the mantissa (fraction part)
if
(
exponent
>=
3
1
)
{
// Special cases for Inf and NaN
if
(
exponent
>=
1
6
)
{
// Special cases for Inf and NaN
// NaN
// NaN
if
(
exponent
==
128
&&
mantissa
!=
0
)
{
if
(
exponent
==
128
&&
mantissa
!=
0
)
{
return
fp16_t
{
static_cast
<
uint16_t
>
(
sign
|
0x7E00
)};
return
fp16_t
{
static_cast
<
uint16_t
>
(
sign
|
0x7E00
)};
...
...
test/infiniop-test/test_generate/__init__.py
View file @
9b32b4b1
from
.infiniop_test
import
InfiniopTestCase
,
InfiniopTestWriter
,
np_dtype_to_ggml
,
gguf_strides
from
.infiniop_test
import
InfiniopTestCase
,
InfiniopTestWriter
,
np_dtype_to_ggml
,
gguf_strides
,
contiguous_gguf_strides
test/infiniop-test/test_generate/infiniop_test.py
View file @
9b32b4b1
...
@@ -29,6 +29,14 @@ def gguf_strides(*args: int) -> list[int] | None:
...
@@ -29,6 +29,14 @@ def gguf_strides(*args: int) -> list[int] | None:
return
list
(
args
)[::
-
1
]
if
args
else
None
return
list
(
args
)[::
-
1
]
if
args
else
None
def
contiguous_gguf_strides
(
shape
:
tuple
[
int
,
...])
->
list
[
int
]:
strides
=
[]
acc
=
1
for
size
in
reversed
(
shape
):
strides
.
append
(
acc
)
acc
*=
size
return
strides
[::
-
1
]
class
InfiniopTestCase
:
class
InfiniopTestCase
:
op_name
:
str
op_name
:
str
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment