Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
54635d9f
Commit
54635d9f
authored
Jan 22, 2026
by
zhushuang
Browse files
issue/949 - feat: add silu_and_mul for moore gpu with test pass
parent
eb89439d
Changes
18
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
866 additions
and
0 deletions
+866
-0
include/infinicore/ops.hpp
include/infinicore/ops.hpp
+1
-0
include/infinicore/ops/silu_and_mul.hpp
include/infinicore/ops/silu_and_mul.hpp
+14
-0
include/infiniop.h
include/infiniop.h
+1
-0
include/infiniop/ops/silu_and_mul.h
include/infiniop/ops/silu_and_mul.h
+71
-0
python/infinicore/nn/functional/__init__.py
python/infinicore/nn/functional/__init__.py
+2
-0
python/infinicore/nn/functional/silu_and_mul.py
python/infinicore/nn/functional/silu_and_mul.py
+17
-0
src/infinicore/ops/silu_and_mul/silu_and_mul.cc
src/infinicore/ops/silu_and_mul/silu_and_mul.cc
+35
-0
src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc
src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc
+50
-0
src/infinicore/pybind11/ops.hpp
src/infinicore/pybind11/ops.hpp
+2
-0
src/infinicore/pybind11/ops/silu_and_mul.hpp
src/infinicore/pybind11/ops/silu_and_mul.hpp
+29
-0
src/infiniop/ops/silu_and_mul/info.h
src/infiniop/ops/silu_and_mul/info.h
+54
-0
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h
+8
-0
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu
+123
-0
src/infiniop/ops/silu_and_mul/operator.cc
src/infiniop/ops/silu_and_mul/operator.cc
+79
-0
src/infiniop/ops/silu_and_mul/silu_and_mul.h
src/infiniop/ops/silu_and_mul/silu_and_mul.h
+46
-0
test/infinicore/ops/silu_and_mul.py
test/infinicore/ops/silu_and_mul.py
+126
-0
test/infiniop/libinfiniop/op_register.py
test/infiniop/libinfiniop/op_register.py
+32
-0
test/infiniop/silu_and_mul.py
test/infiniop/silu_and_mul.py
+176
-0
No files found.
include/infinicore/ops.hpp
View file @
54635d9f
...
...
@@ -17,4 +17,5 @@
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
#include "ops/silu_and_mul.hpp"
#include "ops/swiglu.hpp"
include/infinicore/ops/silu_and_mul.hpp
0 → 100644
View file @
54635d9f
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
namespace infinicore::op {

// Declares the SiluAndMul graph operator class. The trailing macro arguments
// are the operator's argument types: an output Tensor and a const input Tensor.
INFINICORE_GRAPH_OP_CLASS(SiluAndMul, Tensor, const Tensor &);

// Out-of-place form: allocates and returns the result for input `x`.
// `x` is expected to have shape [..., 2*d]; the result has shape [..., d].
Tensor silu_and_mul(const Tensor &x);

// Destination form: writes the result for input `x` into the caller-provided `out`.
void silu_and_mul_(Tensor out, const Tensor &x);

} // namespace infinicore::op
include/infiniop.h
View file @
54635d9f
...
...
@@ -31,6 +31,7 @@
#include "infiniop/ops/rope.h"
#include "infiniop/ops/sigmoid.h"
#include "infiniop/ops/silu.h"
#include "infiniop/ops/silu_and_mul.h"
#include "infiniop/ops/softmax.h"
#include "infiniop/ops/softplus.h"
#include "infiniop/ops/sub.h"
...
...
include/infiniop/ops/silu_and_mul.h
0 → 100644
View file @
54635d9f
#ifndef __INFINIOP_SILU_AND_MUL_API_H__
#define __INFINIOP_SILU_AND_MUL_API_H__
#include "../operator_descriptor.h"
/**
 * @brief Handle type for a SiluAndMul operator descriptor.
 *
 * Declared as a pointer to the generic InfiniopDescriptor struct; callers
 * obtain one from infiniopCreateSiluAndMulDescriptor() and release it with
 * infiniopDestroySiluAndMulDescriptor().
 */
typedef struct InfiniopDescriptor *infiniopSiluAndMulDescriptor_t;
/**
 * @brief Creates a descriptor for the SiLU-and-multiply (SiluAndMul) operation.
 *
 * The interface follows the vLLM SiluAndMul kernel:
 *   - the input has shape [..., 2*d]; its last dimension is split into two halves
 *   - the output has shape [..., d]; its last dimension is half of the input's
 *
 * @param handle   The handle to the InfiniOP library context.
 * @param desc_ptr Location that receives the created descriptor.
 * @param output   Descriptor for the output tensor. Shape [..., d].
 * @param input    Descriptor for the input tensor. Shape [..., 2*d].
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopCreateSiluAndMulDescriptor(
    infiniopHandle_t handle,
    infiniopSiluAndMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output,
    infiniopTensorDescriptor_t input);
/**
 * @brief Reports how much workspace memory a SiluAndMul computation needs.
 *
 * @param desc The SiluAndMul descriptor.
 * @param size Location that receives the required workspace size in bytes.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(
    infiniopSiluAndMulDescriptor_t desc,
    size_t *size);
/**
 * @brief Runs the SiluAndMul operation.
 *
 * Applies SiLU to the first half of the last dimension of `input`, multiplies
 * the result element-wise with the second half, and stores it in `output`.
 *
 * @param desc           The SiluAndMul descriptor.
 * @param workspace      Workspace memory sized according to
 *                       infiniopGetSiluAndMulWorkspaceSize().
 * @param workspace_size Size of `workspace` in bytes.
 * @param output         Output tensor memory. Shape [..., d].
 * @param input          Input tensor memory. Shape [..., 2*d].
 * @param stream         Execution stream (e.g., CUDA stream). Can be NULL for
 *                       the default stream.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopSiluAndMul(
    infiniopSiluAndMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream);
/**
 * @brief Releases a descriptor created by infiniopCreateSiluAndMulDescriptor().
 *
 * @param desc The descriptor to destroy.
 * @return infiniStatus_t Status code of the operation.
 */
__C __export infiniStatus_t infiniopDestroySiluAndMulDescriptor(
    infiniopSiluAndMulDescriptor_t desc);
#endif // __INFINIOP_SILU_AND_MUL_API_H__
python/infinicore/nn/functional/__init__.py
View file @
54635d9f
...
...
@@ -7,6 +7,7 @@ from .random_sample import random_sample
from
.rms_norm
import
rms_norm
from
.rope
import
RopeAlgo
,
rope
from
.silu
import
silu
from
.silu_and_mul
import
silu_and_mul
from
.swiglu
import
swiglu
__all__
=
[
...
...
@@ -21,4 +22,5 @@ __all__ = [
"silu"
,
"swiglu"
,
"linear_w8a8i8"
,
"silu_and_mul"
,
]
python/infinicore/nn/functional/silu_and_mul.py
0 → 100644
View file @
54635d9f
from
infinicore.lib
import
_infinicore
from
infinicore.tensor
import
Tensor
def silu_and_mul(input: Tensor, out=None) -> Tensor:
    r"""Apply the SiLU and Mul (SwiGLU) function.

    Formula: output = SiLU(input_gate) * input_up
    Input shape: [..., 2*d], Output shape: [..., d]

    Args:
        input: Source tensor whose last dimension holds both halves.
        out: Optional destination tensor. When given, the result is written
            into it and it is returned; otherwise a new tensor is returned.
    """
    # Destination form: the binding writes into the caller's tensor.
    if out is not None:
        _infinicore.silu_and_mul_(out._underlying, input._underlying)
        return out

    # Functional form: wrap the freshly produced underlying tensor.
    result = _infinicore.silu_and_mul(input._underlying)
    return Tensor(result)
src/infinicore/ops/silu_and_mul/silu_and_mul.cc
0 → 100644
View file @
54635d9f
#include "infinicore/ops/silu_and_mul.hpp"
#include "../../utils.hpp"
namespace infinicore::op {

// Emits the dispatcher definitions for the SiluAndMul graph operator (the
// expansion is provided by the graph-op macros).
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(SiluAndMul);

// Builds the operator for (out, x): both tensors must live on the same device,
// and the implementation is selected by the device type of `out`.
SiluAndMul::SiluAndMul(Tensor out, const Tensor &x) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, x);
    INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, x);
}

// Entry point used by the free functions below; forwards to the
// record-or-run macro of the graph framework.
void SiluAndMul::execute(Tensor out, const Tensor &x) {
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(SiluAndMul, out, x);
}

// Out-of-place SiluAndMul.
//
// Allocates an output whose shape equals that of `x` except that the last
// dimension is halved, fills it via silu_and_mul_(), and returns it.
//
// Throws std::runtime_error when `x` has no dimensions or when its last
// dimension is odd.
Tensor silu_and_mul(const Tensor &x) {
    Shape out_shape = x->shape();
    const size_t ndim = x->ndim();

    // Without this guard `ndim - 1` wraps around for a rank-0 tensor and the
    // shape would be indexed out of bounds.
    if (ndim == 0) {
        throw std::runtime_error("SiluAndMul input must have at least one dimension.");
    }

    const size_t last = ndim - 1;
    if (out_shape[last] % 2 != 0) {
        throw std::runtime_error("SiluAndMul input last dim must be even.");
    }
    out_shape[last] /= 2;

    auto out = Tensor::empty(out_shape, x->dtype(), x->device());
    silu_and_mul_(out, x);
    return out;
}

// Destination form: computes SiluAndMul of `x` into the caller-provided `out`.
void silu_and_mul_(Tensor out, const Tensor &x) {
    SiluAndMul::execute(out, x);
}

} // namespace infinicore::op
src/infinicore/ops/silu_and_mul/silu_and_mul_infiniop.cc
0 → 100644
View file @
54635d9f
#include "../infiniop_impl.hpp"
#include "infinicore/ops/silu_and_mul.hpp"
namespace infinicore::op::silu_and_mul_impl::infiniop {

// Declares the cached `Descriptor` wrapper type for the SiluAndMul infiniop
// descriptor; the last argument (100) is passed through to the cache macro.
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, SiluAndMul, 100);

// Everything run() needs to replay one planned SiluAndMul call.
struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor;
    graph::GraphTensor workspace, output, input;
};

// Plans one call: looks up (or creates) the descriptor keyed by a hash of the
// two tensors, obtains the workspace tensor, and returns a heap-allocated
// PlannedMeta. Ownership passes to the caller; release it with cleanup().
void *plan(Tensor output, const Tensor &input) {
    size_t seed = hash_combine(output, input);

    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, SiluAndMul,
        seed, output->desc(), input->desc());

    INFINIOP_WORKSPACE_TENSOR(workspace, SiluAndMul, descriptor);

    auto *meta = new PlannedMeta{
        descriptor,
        graph::GraphTensor(workspace),
        graph::GraphTensor(output),
        graph::GraphTensor(input)};
    return meta;
}

// Executes a previously planned call on the current context stream.
void run(void *planned_meta) {
    auto *meta = static_cast<PlannedMeta *>(planned_meta);

    INFINICORE_CHECK_ERROR(infiniopSiluAndMul(
        meta->descriptor->desc,
        meta->workspace->data(),
        meta->workspace->numel(),
        meta->output->data(),
        meta->input->data(),
        context::getStream()));
}

// Frees the PlannedMeta created by plan() and clears the caller's pointer.
void cleanup(void **planned_meta_ptr) {
    auto *meta = static_cast<PlannedMeta *>(*planned_meta_ptr);
    delete meta;
    *planned_meta_ptr = nullptr;
}

// Registers plan/run/cleanup as the SiluAndMul implementation for all devices.
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(SiluAndMul, &plan, &run, &cleanup);

} // namespace infinicore::op::silu_and_mul_impl::infiniop
src/infinicore/pybind11/ops.hpp
View file @
54635d9f
...
...
@@ -21,6 +21,7 @@
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
#include "ops/silu_and_mul.hpp"
#include "ops/swiglu.hpp"
namespace
py
=
pybind11
;
...
...
@@ -48,6 +49,7 @@ inline void bind(py::module &m) {
bind_rope
(
m
);
bind_embedding
(
m
);
bind_linear_w8a8i8
(
m
);
bind_silu_and_mul
(
m
);
}
}
// namespace infinicore::ops
src/infinicore/pybind11/ops/silu_and_mul.hpp
0 → 100644
View file @
54635d9f
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/silu_and_mul.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Registers the Python bindings `silu_and_mul` (returns a new tensor) and
// `silu_and_mul_` (writes into a caller-provided output tensor) on module `m`.
inline void bind_silu_and_mul(py::module &m) {
    m.def("silu_and_mul",
          &op::silu_and_mul,
          py::arg("input"),
          R"doc(
SiLU and Mul (SwiGLU) activation function.
Input should be [..., 2*d], output will be [..., d].
)doc");

    // The output shape differs from the input shape, so this variant cannot be
    // used in place; the docstring only promises the destination-tensor form.
    m.def("silu_and_mul_",
          &op::silu_and_mul_,
          py::arg("output"),
          py::arg("input"),
          R"doc(
Destination-specified SiLU and Mul (SwiGLU) activation function.
Writes the result for input [..., 2*d] into the provided output [..., d].
)doc");
}

} // namespace infinicore::ops
src/infiniop/ops/silu_and_mul/info.h
0 → 100644
View file @
54635d9f
#ifndef __SILU_AND_MUL_INFO_H__
#define __SILU_AND_MUL_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace
op
::
silu_and_mul
{
// Validated problem description for SiluAndMul, shared by the backends.
//
// The tensors are viewed as 2-D: all leading dimensions are folded into
// `batch_size`, and `out_hidden_dim` is the last dimension of the output
// (half of the input's last dimension). Instances are only obtainable
// through create().
class SiluAndMulInfo {
    SiluAndMulInfo() = default;

public:
    infiniDtype_t dtype;   // dtype of the output descriptor
    size_t batch_size;     // product of all dimensions except the last
    size_t out_hidden_dim; // last dimension of the output tensor

    // Checks the output (`y_desc`) and input (`x_desc`) descriptors and
    // builds the info.
    //
    // Returns INFINI_STATUS_BAD_PARAM when the ranks differ, and
    // INFINI_STATUS_BAD_TENSOR_SHAPE when the rank is zero, when the input's
    // last dimension is not exactly twice the output's, or when any leading
    // dimension differs.
    static utils::Result<SiluAndMulInfo> create(
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc) {

        auto dtype = y_desc->dtype();
        auto x_shape = x_desc->shape();
        auto y_shape = y_desc->shape();
        auto ndim = x_desc->ndim();

        if (ndim != y_desc->ndim()) {
            return INFINI_STATUS_BAD_PARAM;
        }

        // A rank-0 tensor has no last dimension to split; without this guard
        // `ndim - 1` would wrap around and index out of bounds.
        if (ndim == 0) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }

        const auto last = ndim - 1;
        if (x_shape[last] != 2 * y_shape[last]) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }

        // Leading dimensions must match one-to-one; their product is the batch.
        size_t batch = 1;
        for (decltype(ndim) i = 0; i < last; ++i) {
            if (x_shape[i] != y_shape[i]) {
                return INFINI_STATUS_BAD_TENSOR_SHAPE;
            }
            batch *= y_shape[i];
        }

        return utils::Result<SiluAndMulInfo>(
            SiluAndMulInfo{dtype, batch, y_shape[last]});
    }

private:
    SiluAndMulInfo(infiniDtype_t dtype, size_t batch, size_t hidden)
        : dtype(dtype), batch_size(batch), out_hidden_dim(hidden) {}
};
}
// namespace op::silu_and_mul
#endif // __SILU_AND_MUL_INFO_H__
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.h
0 → 100644
View file @
54635d9f
#ifndef __SILU_AND_MUL_MOORE_API_H__
#define __SILU_AND_MUL_MOORE_API_H__

#include "../silu_and_mul.h"

// Declares op::silu_and_mul::moore::Descriptor through the shared DESCRIPTOR macro.
DESCRIPTOR(moore)

#endif // __SILU_AND_MUL_MOORE_API_H__
src/infiniop/ops/silu_and_mul/moore/silu_and_mul_moore.mu
0 → 100644
View file @
54635d9f
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_handle.h"
#include "silu_and_mul_moore.h"
#include <musa_bf16.h>
#include <array>
#include <cstdint>
#include <memory>
#include <type_traits>
#include <utility>
namespace op::silu_and_mul::moore {
// Backend-private state of the Moore descriptor: a shared reference to the
// Moore handle internals, which calculate() passes on to obtain a muDNN handle.
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
// Frees the Opaque state that Descriptor::create() allocated with `new`.
Descriptor::~Descriptor() {
delete _opaque;
}
// Validates the tensor descriptors and builds a Moore SiluAndMul descriptor.
//
// @param handle_  InfiniOP handle; reinterpreted as a Moore device handle.
// @param desc_ptr Receives the newly allocated descriptor on success.
// @param y_desc   Output tensor descriptor, shape [..., d].
// @param x_desc   Input tensor descriptor, shape [..., 2*d].
// @return INFINI_STATUS_SUCCESS on success; INFINI_STATUS_BAD_PARAM for a null
//         argument; INFINI_STATUS_BAD_TENSOR_DTYPE when the dtypes differ (an
//         unsupported dtype is rejected by CHECK_DTYPE);
//         INFINI_STATUS_BAD_TENSOR_STRIDES for non-contiguous tensors; or the
//         status reported by SiluAndMulInfo::create().
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

    // Every pointer below is dereferenced, so reject nulls up front.
    if (!handle_ || !desc_ptr || !y_desc || !x_desc) {
        return INFINI_STATUS_BAD_PARAM;
    }

    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);

    auto dtype = y_desc->dtype();
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
    if (x_desc->dtype() != dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    auto result = SiluAndMulInfo::create(y_desc, x_desc);
    CHECK_RESULT(result);

    // calculate() describes both tensors to muDNN with packed row-major
    // strides, so strided inputs would silently produce wrong results.
    if (!y_desc->isContiguous() || !x_desc->isContiguous()) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    auto info = result.take();

    // No workspace is required (workspace size 0).
    *desc_ptr = new Descriptor(
        new Opaque{handle->internal()},
        std::move(info),
        0,
        handle->device, handle->device_id);

    return INFINI_STATUS_SUCCESS;
}
// Runs muDNN SwiGLU for element type T (half, __mt_bfloat16 or float).
//
// @param info     Validated problem sizes (batch and output hidden dimension).
// @param internal Moore handle internals used to borrow a muDNN handle.
// @param y        Output memory, treated as a packed [batch, hidden] matrix.
// @param x        Input memory, treated as a packed [batch, 2 * hidden] matrix.
// @param stream   Stream the muDNN handle is bound to.
// @return INFINI_STATUS_SUCCESS, INFINI_STATUS_INTERNAL_ERROR when muDNN
//         reports a failure, or whatever useMudnn() itself returns.
template <typename T>
infiniStatus_t calculate_impl(
    const SiluAndMulInfo &info,
    std::shared_ptr<device::moore::Handle::Internal> &internal,
    void *y,
    const void *x,
    void *stream) {

    return internal->useMudnn(
        (musaStream_t)stream,
        [&](::musa::dnn::Handle &mudnn_handle) -> infiniStatus_t {
            ::musa::dnn::Tensor x_t, y_t;

            if constexpr (std::is_same_v<T, half>) {
                x_t.SetType(::musa::dnn::Tensor::Type::HALF);
                y_t.SetType(::musa::dnn::Tensor::Type::HALF);
            } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
                x_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
                y_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
            } else {
                // Make an unexpected instantiation a compile error instead of
                // silently treating the data as float.
                static_assert(std::is_same_v<T, float>,
                              "calculate_impl supports half, __mt_bfloat16 and float only");
                x_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
                y_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
            }

            x_t.SetAddr(const_cast<void *>(x));
            y_t.SetAddr(y);

            // Collapse all leading dimensions into one batch dimension.
            const int64_t b = static_cast<int64_t>(info.batch_size);
            const int64_t h = static_cast<int64_t>(info.out_hidden_dim);

            // Input x logical shape is [batch, 2 * hidden], packed row-major.
            std::array<int64_t, 2> x_dims = {b, h * 2};
            std::array<int64_t, 2> x_strides = {h * 2, 1};

            // Output y logical shape is [batch, hidden], packed row-major.
            std::array<int64_t, 2> y_dims = {b, h};
            std::array<int64_t, 2> y_strides = {h, 1};

            x_t.SetNdInfo(2, x_dims.data(), x_strides.data());
            y_t.SetNdInfo(2, y_dims.data(), y_strides.data());

            // muDNN SwiGLU is expected to split each input row of length 2*h,
            // using the first h elements as the gate and the following h
            // elements as the up projection.
            ::musa::dnn::SwiGlu swiglu;
            if (swiglu.Run(mudnn_handle, y_t, x_t) != ::musa::dnn::Status::SUCCESS) {
                // Report kernel failures instead of claiming success.
                return INFINI_STATUS_INTERNAL_ERROR;
            }

            return INFINI_STATUS_SUCCESS;
        });
}
// Dispatches the computation to calculate_impl<T> according to the dtype
// recorded at descriptor creation.
//
// @param workspace, workspace_size Unused here; they are not forwarded to
//        calculate_impl.
// @param y       Output tensor memory.
// @param x       Input tensor memory.
// @param stream  Execution stream forwarded to the kernel launch.
// @return Status from calculate_impl, or INFINI_STATUS_BAD_TENSOR_DTYPE for a
//         dtype outside F16/F32/BF16.
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y, const void *x,
    void *stream) const {

    (void)workspace;
    (void)workspace_size;

    const infiniDtype_t element_type = _info.dtype;

    if (element_type == INFINI_DTYPE_F16) {
        return calculate_impl<half>(_info, _opaque->internal, y, x, stream);
    }
    if (element_type == INFINI_DTYPE_F32) {
        return calculate_impl<float>(_info, _opaque->internal, y, x, stream);
    }
    if (element_type == INFINI_DTYPE_BF16) {
        return calculate_impl<__mt_bfloat16>(_info, _opaque->internal, y, x, stream);
    }
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::silu_and_mul::moore
src/infiniop/ops/silu_and_mul/operator.cc
0 → 100644
View file @
54635d9f
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/silu_and_mul.h"
#ifdef ENABLE_MOORE_API
#include "moore/silu_and_mul_moore.h"
#endif
// Creates a SiluAndMul descriptor by forwarding to the backend that matches
// `handle->device`.
//
// Returns the backend's status, or INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
// when no backend for that device is compiled in.
__C infiniStatus_t infiniopCreateSiluAndMulDescriptor(
    infiniopHandle_t handle,
    infiniopSiluAndMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

#define CREATE(CASE, NAMESPACE)                                                     \
    case CASE:                                                                      \
        return op::silu_and_mul::NAMESPACE::Descriptor::create(                     \
            handle,                                                                 \
            reinterpret_cast<op::silu_and_mul::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                                 \
            x_desc);

    switch (handle->device) {
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    // Explicit default: keeps the switch well-formed when no backend is
    // enabled and covers every device without an implementation.
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

// Keep the helper macro local to this function.
#undef CREATE
}
// Writes the workspace size recorded in `desc` to `*size`.
//
// Returns INFINI_STATUS_SUCCESS, or INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
// when the descriptor's device has no compiled-in backend.
__C infiniStatus_t infiniopGetSiluAndMulWorkspaceSize(
    infiniopSiluAndMulDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                              \
    case CASE:                                                                                            \
        *size = reinterpret_cast<const op::silu_and_mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    // Explicit default: keeps the switch well-formed when no backend is
    // enabled and covers every device without an implementation.
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

// Keep the helper macro local to this function.
#undef GET
}
// Executes SiluAndMul through the backend that owns `desc`.
//
// `y` is the output memory and `x` the input memory; `workspace`,
// `workspace_size` and `stream` are forwarded unchanged to the backend.
// Returns the backend's status, or INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
// when the descriptor's device has no compiled-in backend.
__C infiniStatus_t infiniopSiluAndMul(
    infiniopSiluAndMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                                                  \
    case CASE:                                                                                      \
        return reinterpret_cast<const op::silu_and_mul::NAMESPACE::Descriptor *>(desc)->calculate(  \
            workspace, workspace_size, y, x, stream);

    switch (desc->device_type) {
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    // Explicit default: keeps the switch well-formed when no backend is
    // enabled and covers every device without an implementation.
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

// Keep the helper macro local to this function.
#undef CALCULATE
}
// Deletes `desc` using the concrete descriptor type of its backend.
//
// Returns INFINI_STATUS_SUCCESS, or INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
// when the descriptor's device has no compiled-in backend (nothing is freed
// in that case).
__C infiniStatus_t infiniopDestroySiluAndMulDescriptor(
    infiniopSiluAndMulDescriptor_t desc) {

#define DESTROY(CASE, NAMESPACE)                                                        \
    case CASE:                                                                          \
        delete reinterpret_cast<const op::silu_and_mul::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
    // Explicit default: keeps the switch well-formed when no backend is
    // enabled and covers every device without an implementation.
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

// Keep the helper macro local to this function.
#undef DESTROY
}
src/infiniop/ops/silu_and_mul/silu_and_mul.h
0 → 100644
View file @
54635d9f
#ifndef SILU_AND_MUL_H
#define SILU_AND_MUL_H
#include "../../operator.h"
#include "info.h"
// DESCRIPTOR(NAMESPACE) declares op::silu_and_mul::NAMESPACE::Descriptor, the
// per-backend SiluAndMul descriptor class derived from InfiniopDescriptor.
//
// The generated class stores a backend-defined Opaque pointer, the validated
// SiluAndMulInfo and the workspace size. Its constructor is private, so
// instances are built through the static create(); each backend supplies the
// definitions of Opaque, ~Descriptor(), create() and calculate().
// (Comments are kept outside the macro body because of the line continuations.)
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::silu_and_mul::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
SiluAndMulInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
SiluAndMulInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
#endif // SILU_AND_MUL_H
test/infinicore/ops/silu_and_mul.py
0 → 100644
View file @
54635d9f
import
sys
import
os
sys
.
path
.
insert
(
0
,
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
".."
))
import
torch
import
infinicore
from
framework
import
(
BaseOperatorTest
,
TensorSpec
,
TestCase
,
GenericTestRunner
,
is_broadcast
,
)
# ==============================================================================
# Operator-specific configuration
# ==============================================================================
# Test cases format: (input_shape)
# The operator splits the last dimension: Input (..., 2*d) -> Output (..., d)
# Input shapes under test; the matching output shape halves the last dimension.
_TEST_CASES_DATA = [
    (2, 4),
    (1024, 1024),
    (2, 4, 8),
    (1, 22016),
    (2, 4, 256),
    (2, 4, 16, 256),
]

# Comparison tolerances per dtype.
_TOLERANCE_MAP = {
    infinicore.float16: {"atol": 1e-3, "rtol": 1e-3},
    infinicore.float32: {"atol": 1e-5, "rtol": 1e-5},
    infinicore.bfloat16: {"atol": 5e-3, "rtol": 1e-2},
}

# Dtypes every shape is tested with.
_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
def parse_test_cases():
    """
    Build the SiLUAndMul test cases.

    For every input shape [..., 2*d] and every dtype, two cases are produced in
    this order: a functional call (no output tensor) and a call with an
    explicit, pre-allocated output of shape [..., d].
    Note: In-place is not supported due to shape mismatch between input and output.
    """
    cases = []

    for input_shape in _TEST_CASES_DATA:
        # Same leading dimensions, last dimension halved.
        output_shape = tuple(input_shape[:-1]) + (input_shape[-1] // 2,)

        for dtype in _TENSOR_DTYPES:
            tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
            input_spec = TensorSpec.from_tensor(input_shape, None, dtype)
            output_spec = TensorSpec.from_tensor(output_shape, None, dtype)

            functional_case = TestCase(
                inputs=[input_spec],
                kwargs={},
                output_spec=None,
                comparison_target=None,
                tolerance=tolerance,
                description=f"SiLUAndMul_Functional_{dtype}",
            )
            out_param_case = TestCase(
                inputs=[input_spec],
                kwargs=None,
                output_spec=output_spec,
                comparison_target="out",
                tolerance=tolerance,
                description=f"SiLUAndMul_OutParam_{dtype}",
            )
            cases.extend([functional_case, out_param_case])

    return cases
class OpTest(BaseOperatorTest):
    """SiLUAndMul operator test (SwiGLU activation)"""

    def __init__(self):
        super().__init__("SiLUAndMul")

    def get_test_cases(self):
        """Return the cases produced by parse_test_cases()."""
        return parse_test_cases()

    def torch_operator(self, input, out=None, **kwargs):
        """
        PyTorch reference: SiLU(gate) * up, where gate and up are the first
        and second halves of the last dimension of ``input``.

        When ``out`` is given, the result is copied into it and ``out`` is
        returned; otherwise the newly computed tensor is returned.
        """
        half = input.shape[-1] // 2
        gate, up = input.split([half, half], dim=-1)
        result = torch.mul(torch.nn.functional.silu(gate), up)

        if out is None:
            return result
        out.copy_(result)
        return out

    def infinicore_operator(self, input, out=None, **kwargs):
        """InfiniCore SiLUAndMul implementation wrapper"""
        import infinicore.nn.functional as F

        return F.silu_and_mul(input, out=out)
def main():
    """Run the SiLUAndMul operator tests and exit with the runner's status."""
    GenericTestRunner(OpTest).run_and_exit()


if __name__ == "__main__":
    main()
test/infiniop/libinfiniop/op_register.py
View file @
54635d9f
...
...
@@ -1181,3 +1181,35 @@ def paged_attention_prefill_(lib):
lib
.
infiniopDestroyPagedAttentionPrefillDescriptor
.
argtypes
=
[
infiniopOperatorDescriptor_t
,
]
@OpRegister.operator
def silu_and_mul(lib):
    """Declare the ctypes signatures of the four SiluAndMul entry points on ``lib``."""
    # Create(handle, descriptor*, output tensor desc, input tensor desc)
    lib.infiniopCreateSiluAndMulDescriptor.restype = c_int32
    lib.infiniopCreateSiluAndMulDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,
        infiniopTensorDescriptor_t,
    ]

    # GetWorkspaceSize(descriptor, size*)
    lib.infiniopGetSiluAndMulWorkspaceSize.restype = c_int32
    lib.infiniopGetSiluAndMulWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]

    # Run(descriptor, workspace, workspace_size, output, input, stream)
    lib.infiniopSiluAndMul.restype = c_int32
    lib.infiniopSiluAndMul.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,
        c_size_t,
        c_void_p,
        c_void_p,
        c_void_p,
    ]

    # Destroy(descriptor)
    lib.infiniopDestroySiluAndMulDescriptor.restype = c_int32
    lib.infiniopDestroySiluAndMulDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
test/infiniop/silu_and_mul.py
0 → 100644
View file @
54635d9f
import
torch
import
ctypes
from
ctypes
import
c_uint64
from
libinfiniop
import
(
LIBINFINIOP
,
TestTensor
,
get_test_devices
,
check_error
,
test_operator
,
get_args
,
debug
,
get_tolerance
,
profile_operation
,
TestWorkspace
,
InfiniDtype
,
InfiniDtypeNames
,
InfiniDeviceNames
,
infiniopOperatorDescriptor_t
,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# Format: (input_shape, output_shape)
# Referencing vLLM kernel Silu_and_Mul interface:
# input_shape is [..., 2*d], output_shape is [..., d]
# Each entry pairs an input shape [..., 2*d] with its output shape [..., d].
_TEST_CASES = [
    # input_shape, output_shape
    ((2, 8), (2, 4)),
    ((1024, 1024), (1024, 512)),
    ((16, 8192), (16, 4096)),
    ((2, 128, 2048), (2, 128, 1024)),
    ((8, 1, 4096), (8, 1, 2048)),
    ((2, 4, 16, 256), (2, 4, 16, 128)),
]

# Dtypes every shape pair is tested with.
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]

# Comparison tolerances per dtype.
_TOLERANCE_MAP = {
    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}

# Defaults; overwritten from the command line when run as a script.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 100
# PyTorch reference: silu(gate) * up where [gate, up] = split(input)
def silu_and_mul_torch(out, input_tensor):
    """
    Reference SwiGLU: writes SiLU(gate) * up into ``out``.

    ``gate`` is the first half of the last dimension of ``input_tensor`` and
    ``up`` is everything after it. Nothing is returned; the result is stored
    in ``out``.
    """
    half = input_tensor.shape[-1] // 2

    # First `half` entries form the gate, the remainder the up projection.
    gate = input_tensor.narrow(-1, 0, half)
    up = input_tensor[..., half:]

    activated = torch.nn.functional.silu(gate)
    torch.mul(activated, up, out=out)
# ==============================================================================
# Test Logic
# ==============================================================================
def test(
    handle,
    device,
    input_shape,
    output_shape,
    dtype=InfiniDtype.F16,
    sync=None,
):
    """
    Run one SiluAndMul case through the library and compare it with PyTorch.

    Creates the input, output and reference tensors, computes the PyTorch
    reference, runs the library operator once, asserts the results agree
    within the dtype's tolerance, optionally profiles both implementations,
    and destroys the descriptor.
    """
    print(
        f"Testing SiluAndMul on {InfiniDeviceNames[device]} with "
        f"input_shape:{input_shape} output_shape:{output_shape} dtype:{InfiniDtypeNames[dtype]}"
    )

    x = TestTensor(input_shape, None, dtype, device)
    y = TestTensor(output_shape, None, dtype, device, mode="zeros")
    expected = TestTensor(output_shape, None, dtype, device, mode="zeros")

    # Only support contiguous Tensor
    if not all(t.torch_tensor().is_contiguous() for t in (x, y, expected)):
        raise ValueError("This operator only supports contiguous memory layout.")

    # PyTorch answer reference
    def torch_silu_and_mul_reference():
        silu_and_mul_torch(expected.torch_tensor(), x.torch_tensor())

    torch_silu_and_mul_reference()

    if sync is not None:
        sync()

    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateSiluAndMulDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
        )
    )

    # The tensor descriptors are destroyed right after the operator
    # descriptor has been created.
    for tensor in [x, y]:
        tensor.destroy_desc()

    # Workspace
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetSiluAndMulWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, device)

    def lib_op():
        check_error(
            LIBINFINIOP.infiniopSiluAndMul(
                descriptor,
                workspace.data(),
                workspace_size.value,
                y.data(),
                x.data(),
                None,
            )
        )

    lib_op()

    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), expected.torch_tensor(), atol=atol, rtol=rtol)

    assert torch.allclose(
        y.actual_tensor(), expected.torch_tensor(), atol=atol, rtol=rtol
    )

    # Profiling workflow
    if PROFILE:
        profile_operation(
            "PyTorch",
            lambda: torch_silu_and_mul_reference(),
            device,
            NUM_PRERUN,
            NUM_ITERATIONS,
        )
        profile_operation(
            " lib", lambda: lib_op(), device, NUM_PRERUN, NUM_ITERATIONS
        )

    check_error(LIBINFINIOP.infiniopDestroySiluAndMulDescriptor(descriptor))
# ==============================================================================
# Main Execution
# ==============================================================================
if __name__ == "__main__":
    # Command-line options override the module-level defaults.
    args = get_args()
    DEBUG = args.debug
    PROFILE = args.profile
    NUM_PRERUN = args.num_prerun
    NUM_ITERATIONS = args.num_iterations

    # Run every shape/dtype combination on each selected device.
    for device in get_test_devices(args):
        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)

    print("\033[92mSiluAndMul Test passed!\033[0m")
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment