jerrrrry / infinicore · Commits

Commit 65df17f7 (unverified)
Authored Mar 26, 2025 by PanZezhong1725; committed by GitHub on Mar 26, 2025

Merge pull request #47 from YdrMaster/swiglu-cpu

issue/46: swiglu operator - CPU

Parents: 9daba5b8, ee68b55d

Showing 13 changed files with 362 additions and 36 deletions (+362 -36)
.github/workflows/build.yml                 +1   -1
src/infiniop/binary/binary.h                +98  -0
src/infiniop/binary/cpu/binary_cpu.h        +57  -0
src/infiniop/devices/cpu/common_cpu.cc      +5   -1
src/infiniop/devices/cpu/common_cpu.h       +4   -0
src/infiniop/ops/swiglu/cpu/swiglu_cpu.cc   +61  -0
src/infiniop/ops/swiglu/cpu/swiglu_cpu.h    +22  -0
src/infiniop/ops/swiglu/operator.cc         +61  -19
src/infiniop/tensor.h                       +4   -0
src/infiniop/tensor_descriptor.cc           +19  -0
src/utils/check.h                           +10  -0
test/infiniop/swiglu.py                     +15  -15
xmake/cpu.lua                               +5   -0
.github/workflows/build.yml

```diff
@@ -32,7 +32,7 @@ jobs:
           xmake-version: latest
       - name: configure xmake
-        run: xmake f -cv
+        run: xmake f --omp=y -cv
       - name: build with xmake
         run: xmake build
```
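The only CI change is the `--omp=y` flag passed to `xmake f`. It turns on the `omp` option that the updated `xmake/cpu.lua` (last file below) checks via `has_config("omp")` to require and link the OpenMP package, which the `#pragma omp parallel for` loops in the new `binary_cpu.h` rely on for parallel execution.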
src/infiniop/binary/binary.h · new file (mode 100644)

```cpp
#ifndef __INFINIOP_BINARY_H__
#define __INFINIOP_BINARY_H__

#include "../operator.h"
#include "../tensor.h"

#include <numeric>

/**
 * The design of this class is based on the DESCRIPTOR macro designed by
 * YdrMaster in matmul.h.
 */
#define BINARY_DESCRIPTOR(OP, NAMESPACE)                      \
                                                              \
    namespace op::OP::NAMESPACE {                             \
    class Descriptor final : public InfiniopDescriptor {      \
        struct Opaque;                                        \
        Opaque *_opaque;                                      \
        infiniDtype_t _dtype;                                 \
        op::binary::BinaryInfo _info;                         \
                                                              \
        Descriptor(                                           \
            infiniDtype_t dtype,                              \
            op::binary::BinaryInfo info,                      \
            Opaque *opaque,                                   \
            infiniDevice_t device_type,                       \
            int device_id)                                    \
            : InfiniopDescriptor{device_type, device_id},     \
              _opaque(opaque),                                \
              _dtype(dtype),                                  \
              _info(info) {}                                  \
                                                              \
    public:                                                   \
        ~Descriptor();                                        \
                                                              \
        static infiniStatus_t create(                         \
            infiniopHandle_t handle,                          \
            Descriptor **desc_ptr,                            \
            infiniopTensorDescriptor_t c_desc,                \
            infiniopTensorDescriptor_t a_desc,                \
            infiniopTensorDescriptor_t b_desc);               \
                                                              \
        infiniStatus_t calculate(                             \
            void *c,                                          \
            const void *a,                                    \
            const void *b,                                    \
            void *stream) const;                              \
    };                                                        \
    }

namespace op::binary {

// Stores metadata for binary operations on CPU
struct BinaryInfo {
    size_t c_data_size;
    size_t ndim;
    bool contiguous;
    bool broadcasted;
    std::vector<size_t> c_shape;
    std::vector<size_t> a_shape;
    std::vector<size_t> b_shape;
    std::vector<ptrdiff_t> c_strides;
    std::vector<ptrdiff_t> a_strides;
    std::vector<ptrdiff_t> b_strides;
};

inline infiniStatus_t createBinaryInfo(
    BinaryInfo &info,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {

    if (!c_desc || !a_desc || !b_desc) {
        return INFINI_STATUS_BAD_PARAM;
    }

    info.c_data_size = c_desc->numel();
    info.ndim = c_desc->ndim();
    info.contiguous = c_desc->isContiguous()
                   && a_desc->isContiguous()
                   && b_desc->isContiguous();

    // Destination cannot have broadcast setup
    if (c_desc->hasBroadcastDim()) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    const bool ndim_match = (c_desc->ndim() == a_desc->ndim())
                         && (c_desc->ndim() == b_desc->ndim());
    info.broadcasted = !info.contiguous
                    && (!ndim_match
                        || a_desc->hasBroadcastDim()
                        || b_desc->hasBroadcastDim());

    info.c_shape = std::move(c_desc->shape());
    info.a_shape = std::move(a_desc->shape());
    info.b_shape = std::move(b_desc->shape());
    info.c_strides = std::move(c_desc->strides());
    info.a_strides = std::move(a_desc->strides());
    info.b_strides = std::move(b_desc->strides());

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::binary

#endif // __INFINIOP_BINARY_H__
```
src/infiniop/binary/cpu/binary_cpu.h · new file (mode 100644)

```cpp
#ifndef __INFINIOP_BINARY_CPU_H__
#define __INFINIOP_BINARY_CPU_H__

#include "../../devices/cpu/common_cpu.h"
#include "../binary.h"

#include <utility>

namespace op::common_cpu {
namespace binary_op {

// Perform binary computation when inputs and the output can have different dtypes
template <typename Tc, typename Ta, typename Tb, typename BinaryOp, typename... Args>
void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *b, Args &&...args) {
    auto a_ = reinterpret_cast<const Ta *>(a);
    auto b_ = reinterpret_cast<const Tb *>(b);
    auto c_ = reinterpret_cast<Tc *>(c);
    ptrdiff_t data_size = info.c_data_size;

#pragma omp parallel for
    for (ptrdiff_t i = 0; i < data_size; ++i) {
        size_t a_index = info.contiguous ? i
                       : (info.broadcasted
                              ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.a_strides.data())
                              : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data()));
        size_t b_index = info.contiguous ? i
                       : (info.broadcasted
                              ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.b_strides.data())
                              : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data()));
        size_t c_index = info.contiguous ? i
                       : (op::common_cpu::indexToOffset(i, info.ndim, info.c_shape.data(), info.c_strides.data()));

        c_[c_index] = BinaryOp{}(a_[a_index], b_[b_index], std::forward<Args>(args)...);
    }
}

// Perform binary computation when all inputs and the output share the same dtype
template <typename Tdata, typename BinaryOp, typename... Args>
void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *b, Args &&...args) {
    auto a_ = reinterpret_cast<const Tdata *>(a);
    auto b_ = reinterpret_cast<const Tdata *>(b);
    auto c_ = reinterpret_cast<Tdata *>(c);
    ptrdiff_t data_size = info.c_data_size;

#pragma omp parallel for
    for (ptrdiff_t i = 0; i < data_size; ++i) {
        size_t a_index = info.contiguous ? i
                       : (info.broadcasted
                              ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.a_strides.data())
                              : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data()));
        size_t b_index = info.contiguous ? i
                       : (info.broadcasted
                              ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.b_strides.data())
                              : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data()));
        size_t c_index = info.contiguous ? i
                       : (op::common_cpu::indexToOffset(i, info.ndim, info.c_shape.data(), info.c_strides.data()));

        if constexpr (std::is_same_v<Tdata, fp16_t>) {
            float a_val = utils::cast<float>(a_[a_index]);
            float b_val = utils::cast<float>(b_[b_index]);
            c_[c_index] = utils::cast<fp16_t>(BinaryOp{}(a_val, b_val, std::forward<Args>(args)...));
        } else {
            c_[c_index] = BinaryOp{}(a_[a_index], b_[b_index], std::forward<Args>(args)...);
        }
    }
}

} // namespace binary_op
} // namespace op::common_cpu

#endif // __INFINIOP_BINARY_CPU_H__
```
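Both overloads follow the same pattern: the element-wise operation is a stateless, default-constructible functor invoked as `BinaryOp{}(a, b, ...)`, the loop index is a signed `ptrdiff_t` as required by `#pragma omp parallel for`, and three index paths cover the contiguous, strided, and broadcast layouts. A minimal, self-contained model of the pattern (contiguous case only; `MulOp` and the test data are illustrative, not part of this commit):

```cpp
// Minimal model of binary_op::calculate for the contiguous case:
// the element-wise op is a stateless functor invoked as BinaryOp{}(a, b).
#include <cstddef>
#include <cstdio>

struct MulOp {
    template <typename T>
    T operator()(const T &a, const T &b) const { return a * b; }
};

template <typename Tdata, typename BinaryOp>
void calculate(std::size_t n, Tdata *c, const Tdata *a, const Tdata *b) {
    // Signed loop index, as OpenMP's canonical loop form expects.
#pragma omp parallel for
    for (std::ptrdiff_t i = 0; i < static_cast<std::ptrdiff_t>(n); ++i) {
        c[i] = BinaryOp{}(a[i], b[i]);
    }
}

int main() {
    float a[] = {1, 2, 3}, b[] = {4, 5, 6}, c[3];
    calculate<float, MulOp>(3, c, a, b);
    std::printf("%g %g %g\n", c[0], c[1], c[2]); // 4 10 18
    return 0;
}
```

The real `swiglu_cpu.cc` below instantiates exactly this shape of call: `calculate<float, SwiGLUOp>(_info, c, a, b)`.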
src/infiniop/devices/cpu/common_cpu.cc

```diff
 #include "common_cpu.h"
 
+namespace op::common_cpu {
+
 size_t indexToReducedOffset(
     size_t flat_index,
     size_t ndim,
@@ -19,7 +21,7 @@ size_t indexToOffset(
     const size_t *shape,
     const ptrdiff_t *strides) {
     size_t res = 0;
-    for (size_t i = ndim; i-- >= 0;) {
+    for (size_t i = ndim; i-- > 0;) {
         res += (flat_index % shape[i]) * strides[i];
         flat_index /= shape[i];
     }
@@ -48,3 +50,5 @@ std::vector<size_t> getPaddedShape(
     }
     return padded_shape;
 }
+
+} // namespace op::common_cpu
```
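Two changes land in this file: the free functions are wrapped in `namespace op::common_cpu`, and the reverse-iteration loop condition is corrected. With an unsigned `size_t i`, the old `i-- >= 0` is always true, so the loop could never terminate normally; `i-- > 0` visits `ndim - 1 ... 0` exactly once each. For orientation, here is a standalone worked example of the `indexToOffset` logic from the context lines above (the shape and strides are illustrative):

```cpp
// Worked example of indexToOffset (reproduced locally): map a flat row-major
// index into a strided tensor's memory offset by peeling coordinates off
// from the innermost dim.
#include <cstddef>
#include <cstdio>

std::size_t indexToOffset(std::size_t flat_index, std::size_t ndim,
                          const std::size_t *shape, const std::ptrdiff_t *strides) {
    std::size_t res = 0;
    for (std::size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}

int main() {
    // Shape (13, 4) stored with row stride 10 (padded rows): flat index 5 is
    // element (1, 1), which lives at offset 1*10 + 1*1 = 11.
    std::size_t shape[] = {13, 4};
    std::ptrdiff_t strides[] = {10, 1};
    std::printf("%zu\n", indexToOffset(5, 2, shape, strides)); // prints 11
    return 0;
}
```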
src/infiniop/devices/cpu/common_cpu.h

```diff
@@ -13,6 +13,8 @@
 #include <omp.h>
 #endif
 
+namespace op::common_cpu {
+
 // return the memory offset of original tensor, given the flattened index of broadcasted tensor
 size_t indexToReducedOffset(
     size_t flat_index,
     size_t ndim,
     const ptrdiff_t *broadcasted_strides,
     const ptrdiff_t *target_strides);
@@ -28,4 +30,6 @@ size_t getPaddedSize(size_t ndim, size_t *shape, const size_t *pads);
 // calculate the padded shape and store the result in padded_shape
 std::vector<size_t> getPaddedShape(
     size_t ndim,
     const size_t *shape,
     const size_t *pads);
+
+} // namespace op::common_cpu
 
 #endif // __INFINIOP__COMMON_CPU_H__
```
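Only the declaration of `indexToReducedOffset` appears in this commit's visible hunks. As an assumption for illustration only (not necessarily the repository's exact body), one common way to implement such a mapping is to recover per-dim coordinates from the flat index via the output's contiguous strides and re-project them through the input's strides, so stride-0 (broadcast) dims contribute nothing:

```cpp
// Hedged sketch of a broadcast-aware offset mapping. This body is an assumed
// illustration; the real implementation is not shown in this diff.
#include <cstddef>
#include <cstdio>

std::size_t indexToReducedOffset(
    std::size_t flat_index, std::size_t ndim,
    const std::ptrdiff_t *broadcasted_strides, // strides of the (contiguous) output
    const std::ptrdiff_t *target_strides) {    // strides of the broadcast input
    std::size_t res = 0;
    for (std::size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

int main() {
    // Output shape (2, 3) has contiguous strides (3, 1); an input broadcast
    // along dim 0 has strides (0, 1). Flat output index 4 is element (1, 1),
    // which maps back to input offset 1.
    std::ptrdiff_t out_strides[] = {3, 1};
    std::ptrdiff_t in_strides[] = {0, 1};
    std::printf("%zu\n", indexToReducedOffset(4, 2, out_strides, in_strides)); // 1
    return 0;
}
```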
src/infiniop/ops/swiglu/cpu/swiglu_cpu.cc · new file (mode 100644)

```cpp
#include "swiglu_cpu.h"

namespace op::swiglu::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t up_desc,
    infiniopTensorDescriptor_t gate_desc) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &out_shape = out_desc->shape();
    const auto &up_shape = up_desc->shape();
    const auto &gate_shape = gate_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

    if (!SAME_VEC(out_shape, up_shape, gate_shape)) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }

    op::binary::BinaryInfo info;
    CHECK_STATUS(op::binary::createBinaryInfo(info, out_desc, up_desc, gate_desc));

    // Create descriptor
    *desc_ptr = new Descriptor(
        dtype,
        std::move(info),
        nullptr,
        handle->device,
        handle->device_id);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *c,
    const void *a,
    const void *b,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        op::common_cpu::binary_op::calculate<fp16_t, SwiGLUOp>(_info, c, a, b);
        break;
    case INFINI_DTYPE_F32:
        op::common_cpu::binary_op::calculate<float, SwiGLUOp>(_info, c, a, b);
        break;
    case INFINI_DTYPE_F64:
        op::common_cpu::binary_op::calculate<double, SwiGLUOp>(_info, c, a, b);
        break;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::swiglu::cpu
```
src/infiniop/ops/swiglu/cpu/swiglu_cpu.h · new file (mode 100644)

```cpp
#ifndef __SWIGLU_CPU_H__
#define __SWIGLU_CPU_H__

#include "../../../binary/cpu/binary_cpu.h"

BINARY_DESCRIPTOR(swiglu, cpu)

struct SwiGLUOp {
private:
    template <typename T>
    T sigmoid(const T &x) const {
        return 1 / (1 + std::exp(-x));
    }

public:
    template <typename T>
    T operator()(const T &up, const T &gate) const {
        return gate * sigmoid(gate) * up;
    }
};

#endif // __SWIGLU_CPU_H__
```
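`SwiGLUOp` computes `out = gate * sigmoid(gate) * up`, i.e. SiLU applied to the gate, scaled by the up projection. A standalone numerical check (illustrative values, with the sigmoid inlined):

```cpp
// Standalone check of the SwiGLU formula implemented by SwiGLUOp above:
// out = gate * sigmoid(gate) * up.
#include <cmath>
#include <cstdio>

struct SwiGLUOp {
    template <typename T>
    T operator()(const T &up, const T &gate) const {
        return gate * (1 / (1 + std::exp(-gate))) * up;
    }
};

int main() {
    // sigmoid(1) ≈ 0.73106, so SwiGLU(up=2, gate=1) ≈ 1 * 0.73106 * 2 ≈ 1.46212
    std::printf("%f\n", SwiGLUOp{}(2.0, 1.0));
    return 0;
}
```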
src/infiniop/ops/swiglu/operator.cc

```diff
@@ -2,15 +2,30 @@
 #include "../../handle.h"
 #include "infiniop/ops/swiglu.h"
 
+#ifdef ENABLE_CPU_API
+#include "cpu/swiglu_cpu.h"
+#endif
+
 __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
     infiniopHandle_t handle,
     infiniopSwiGLUDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t c_desc,
     infiniopTensorDescriptor_t a_desc,
     infiniopTensorDescriptor_t b_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                \
+        return op::swiglu::NAMESPACE::Descriptor::create(                     \
+            handle,                                                           \
+            reinterpret_cast<op::swiglu::NAMESPACE::Descriptor **>(desc_ptr), \
+            c_desc,                                                           \
+            a_desc,                                                           \
+            b_desc)
+
     switch (handle->device) {
-#ifdef ENABLE_CPU
-    case DevCpu:
-        return cpuCreateSwiGLUDescriptor(
-            handle, (SwiGLUCpuDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NV_GPU
     case DevNvGpu:
@@ -43,17 +58,30 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
         return musaCreateSwiGLUDescriptor(
             handle, (SwiGLUMusaDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    };
+    }
+
+#undef CREATE
 }
 
 __C infiniStatus_t infiniopSwiGLU(
     infiniopSwiGLUDescriptor_t desc,
     void *c,
     const void *a,
     const void *b,
     void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                               \
+    case CASE:                                                                   \
+        return reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(c, a, b, stream)
+
     switch (desc->device_type) {
-#ifdef ENABLE_CPU
-    case DevCpu:
-        return cpuSwiGLU((SwiGLUCpuDescriptor_t)desc, c, a, b, stream);
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NV_GPU
     case DevNvGpu:
@@ -76,16 +104,26 @@ __C infiniStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, void *c,
     case DevMthreadsGpu:
         return musaSwiGLU((SwiGLUMusaDescriptor_t)desc, c, a, b, stream);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
+
+#undef CALCULATE
 }
 
 __C infiniStatus_t
 infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                    \
+    case CASE:                                                                     \
+        delete reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc);  \
+        return INFINI_STATUS_SUCCESS;
+
     switch (desc->device_type) {
-#ifdef ENABLE_CPU
-    case DevCpu:
-        return cpuDestroySwiGLUDescriptor((SwiGLUCpuDescriptor_t)desc);
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NV_GPU
     case DevNvGpu:
@@ -108,6 +146,10 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
     case DevMthreadsGpu:
         return musaDestroySwiGLUDescriptor((SwiGLUMusaDescriptor_t)desc);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
+
+#undef DELETE
 }
```
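The refactor replaces the per-backend `cpuCreateSwiGLUDescriptor` / `cpuSwiGLU` / `cpuDestroySwiGLUDescriptor` entry points with `CREATE` / `CALCULATE` / `DELETE` macros that generate one `case` per device, casting the opaque descriptor to the backend's `Descriptor` type and forwarding the call. A minimal, self-contained model of that dispatch pattern (the enum, descriptor type, and function here are illustrative, not the real API):

```cpp
// Minimal model of the macro-generated switch dispatch used above.
#include <cstdio>

enum Device { CPU_DEV, GPU_DEV };

struct CpuDescriptor {
    int calculate() const {
        std::puts("cpu calculate");
        return 0;
    }
};

// Expands to a `case` that casts the opaque pointer to the backend's
// descriptor type and forwards the call.
#define CALCULATE(CASE, TYPE) \
    case CASE:                \
        return reinterpret_cast<const TYPE *>(desc)->calculate()

int dispatch(Device device, const void *desc) {
    switch (device) {
        CALCULATE(CPU_DEV, CpuDescriptor);
    default:
        return -1;
    }
}
#undef CALCULATE

int main() {
    CpuDescriptor d;
    return dispatch(CPU_DEV, &d);
}
```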
src/infiniop/tensor.h

```diff
@@ -28,6 +28,10 @@ public:
     bool isContiguous() const;
     size_t numel() const;
 
+    // a dim is broadcasted if its corresponding stride is 0 but dim > 1
+    bool hasBroadcastDim() const;
+    std::vector<size_t> getBroadcastDim() const;
+
     infiniopTensorDescriptor_t dimMerge(size_t dim_start, size_t dim_end) const;
     infiniopTensorDescriptor_t dimSplit(size_t axis, const std::vector<size_t> &dims) const;
     infiniopTensorDescriptor_t dimPermute(const std::vector<size_t> &order) const;
```
src/infiniop/tensor_descriptor.cc

```diff
 #include "../utils.h"
 #include "tensor.h"
 
+#include <algorithm>
 #include <cstring>
 #include <functional>
 #include <numeric>
@@ -85,6 +86,24 @@ bool InfiniopTensorDescriptor::isContiguous() const {
     return isContiguous(0, ndim() - 1);
 }
 
+bool InfiniopTensorDescriptor::hasBroadcastDim() const {
+    return std::any_of(_shape.begin(), _shape.end(),
+                       [&, i = 0](const auto &) mutable {
+                           return _shape[i] != 1 && _strides[i++] == 0;
+                       });
+}
+
+std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
+    std::vector<size_t> res;
+    for (size_t i = 0; i < ndim(); ++i) {
+        if (_shape[i] != 1 && _strides[i] == 0) {
+            res.push_back(i);
+        }
+    }
+    return res;
+}
+
 infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
     if (dim_start > dim_end || dim_end >= ndim()) {
         return nullptr;
```
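`hasBroadcastDim()` flags a dim as broadcasted when its extent is greater than 1 yet its stride is 0, so distinct logical positions alias the same memory; size-1 dims with stride 0 are allowed, and `getBroadcastDim()` lists the offending dims explicitly. A standalone model of the predicate (the shapes are illustrative):

```cpp
// Standalone model of the hasBroadcastDim() predicate above.
#include <cstddef>
#include <cstdio>
#include <vector>

bool hasBroadcastDim(const std::vector<std::size_t> &shape,
                     const std::vector<std::ptrdiff_t> &strides) {
    for (std::size_t i = 0; i < shape.size(); ++i) {
        if (shape[i] != 1 && strides[i] == 0) {
            return true;
        }
    }
    return false;
}

int main() {
    // (13, 4) with strides (0, 1): dim 0 repeats the same 4 values 13 times.
    std::printf("%d\n", hasBroadcastDim({13, 4}, {0, 1})); // 1
    // (1, 4) with strides (0, 1): size-1 dims may have stride 0.
    std::printf("%d\n", hasBroadcastDim({1, 4}, {0, 1}));  // 0
    return 0;
}
```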
src/utils/check.h

```diff
 #ifndef INFINIUTILS_CHECK_H
 #define INFINIUTILS_CHECK_H
 
 #include <iostream>
+#include <tuple>
 
 #define CHECK_API_OR(API, EXPECT, ACTION) \
     do { \
@@ -30,4 +31,13 @@
                      return INFINI_STATUS_BAD_TENSOR_DTYPE); \
     } while (0)
 
+#define SAME_VEC(...)                                         \
+    [&] {                                                     \
+        auto &&_vec = std::forward_as_tuple(__VA_ARGS__);     \
+        const auto &_base = std::get<0>(_vec);                \
+        return [&_base](auto &&...args) {                     \
+            return ((args == _base) && ...);                  \
+        }(__VA_ARGS__);                                       \
+    }()
+
 #endif // INFINIUTILS_CHECK_H
```
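`SAME_VEC` compares every argument against the first via a C++17 fold expression; `swiglu_cpu.cc` above uses it to require that all three tensor shapes match. A standalone check (the vectors are illustrative; the macro body is reproduced from the diff):

```cpp
// Standalone check of SAME_VEC semantics: every argument is compared against
// the first and the results are folded with &&.
#include <cstdio>
#include <tuple>
#include <vector>

#define SAME_VEC(...)                                     \
    [&] {                                                 \
        auto &&_vec = std::forward_as_tuple(__VA_ARGS__); \
        const auto &_base = std::get<0>(_vec);            \
        return [&_base](auto &&...args) {                 \
            return ((args == _base) && ...);              \
        }(__VA_ARGS__);                                   \
    }()

int main() {
    std::vector<int> a{13, 4}, b{13, 4}, c{13, 5};
    std::printf("%d\n", SAME_VEC(a, b));    // 1: b equals the base a
    std::printf("%d\n", SAME_VEC(a, b, c)); // 0: c differs from a
    return 0;
}
```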
test/infiniop/swiglu.py

```diff
@@ -25,19 +25,25 @@ _TEST_CASES_ = [
     # shape, a_stride, b_stride, c_stride
     ((13, 4), None, None, None),
     ((13, 4), (10, 1), (10, 1), (10, 1)),
-    # ((13, 4, 4), None, None, None),
+    ((13, 4, 4), None, None, None),
-    # ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
     ((16, 5632), None, None, None),
     ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
-    # ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), None, None, None),
-    # ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
 ]
 
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+    INPLACE_B = auto()
+
 # Inplace options applied for each test case in _TEST_CASES_
 _INPLACE = [
-    "Inplace.OUT_OF_PLACE",
+    Inplace.OUT_OF_PLACE,
-    "Inplace.INPLACE_A",
+    Inplace.INPLACE_A,
-    "Inplace.INPLACE_B",
+    Inplace.INPLACE_B,
 ]
 
 # Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
@@ -48,7 +54,7 @@ _TEST_CASES = [
 ]
 
 # Data types used for testing
-_TENSOR_DTYPES = [torch.float16]
+_TENSOR_DTYPES = [torch.float16, torch.float32]
 
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
@@ -61,12 +67,6 @@ NUM_PRERUN = 10
 NUM_ITERATIONS = 1000
 
-class Inplace(Enum):
-    OUT_OF_PLACE = auto()
-    INPLACE_A = auto()
-    INPLACE_B = auto()
-
 class SwiGLUDescriptor(Structure):
     _fields_ = [("device", c_int32)]
 
@@ -132,7 +132,7 @@ def test(
     # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
     for tensor in [a_tensor, b_tensor, c_tensor]:
-        tensor.descriptor.contents.invalidate()
+        tensor.destroyDesc(lib)
 
 def lib_swiglu():
     check_error(
```
xmake/cpu.lua

```diff
@@ -37,3 +37,8 @@ target("infinirt-cpu")
     set_languages("cxx17")
     add_files("../src/infinirt/cpu/*.cc")
 target_end()
+
+if has_config("omp") then
+    add_requires("openmp")
+    add_packages("openmp")
+end
```