Unverified Commit 65df17f7 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #47 from YdrMaster/swiglu-cpu

issue/46: swiglu 算子 - CPU
parents 9daba5b8 ee68b55d
......@@ -32,7 +32,7 @@ jobs:
xmake-version: latest
- name: configure xmake
run: xmake f -cv
run: xmake f --omp=y -cv
- name: build with xmake
run: xmake build
......
#ifndef __INFINIOP_BINARY_H__
#define __INFINIOP_BINARY_H__
#include "../operator.h"
#include "../tensor.h"

#include <cstddef>
#include <numeric>
#include <vector>
/**
 * BINARY_DESCRIPTOR(OP, NAMESPACE) declares the operator descriptor class
 * for the binary elementwise operator `OP` on backend `NAMESPACE`, e.g.
 * op::swiglu::cpu::Descriptor.
 *
 * The design of this class is based on the DESCRIPTOR macro designed by
 * YdrMaster in matmul.h.
 *
 * The generated class stores the operand dtype, the precomputed
 * op::binary::BinaryInfo layout metadata and an opaque backend-specific
 * state pointer.  `create` builds a descriptor from the c/a/b tensor
 * descriptors; `calculate` computes c = OP(a, b) on the given stream.
 */
#define BINARY_DESCRIPTOR(OP, NAMESPACE)                          \
                                                                  \
    namespace op::OP::NAMESPACE {                                 \
    class Descriptor final : public InfiniopDescriptor {          \
        struct Opaque;                                            \
        Opaque *_opaque;                                          \
        infiniDtype_t _dtype;                                     \
        op::binary::BinaryInfo _info;                             \
                                                                  \
        Descriptor(                                               \
            infiniDtype_t dtype,                                  \
            op::binary::BinaryInfo info,                          \
            Opaque *opaque,                                       \
            infiniDevice_t device_type,                           \
            int device_id)                                        \
            : InfiniopDescriptor{device_type, device_id},         \
              _opaque(opaque),                                    \
              _dtype(dtype),                                      \
              _info(info) {}                                      \
                                                                  \
    public:                                                       \
        ~Descriptor();                                            \
                                                                  \
        static infiniStatus_t create(                             \
            infiniopHandle_t handle,                              \
            Descriptor **desc_ptr,                                \
            infiniopTensorDescriptor_t c_desc,                    \
            infiniopTensorDescriptor_t a_desc,                    \
            infiniopTensorDescriptor_t b_desc);                   \
                                                                  \
        infiniStatus_t calculate(                                 \
            void *c,                                              \
            const void *a,                                        \
            const void *b,                                        \
            void *stream) const;                                  \
    };                                                            \
    }
namespace op::binary {
// Layout metadata shared by the binary elementwise kernels on CPU.
// Filled in by createBinaryInfo() below.
struct BinaryInfo {
    size_t c_data_size; // total number of output elements
    size_t ndim;        // rank of the output tensor
    bool contiguous;    // true when c, a and b are all contiguous
    bool broadcasted;   // true when a or b needs broadcast index mapping
    std::vector<size_t> c_shape; // shapes of output and the two inputs
    std::vector<size_t> a_shape;
    std::vector<size_t> b_shape;
    std::vector<ptrdiff_t> c_strides; // strides of output and the two inputs
    std::vector<ptrdiff_t> a_strides;
    std::vector<ptrdiff_t> b_strides;
};
// Validates the c/a/b tensor descriptors and fills `info` with the layout
// metadata needed by the elementwise kernels.
//
// Returns INFINI_STATUS_BAD_PARAM when any descriptor is null,
// INFINI_STATUS_BAD_TENSOR_STRIDES when the destination has a broadcast
// (zero-stride) dimension, INFINI_STATUS_SUCCESS otherwise.
inline infiniStatus_t createBinaryInfo(BinaryInfo &info,
                                       infiniopTensorDescriptor_t c_desc,
                                       infiniopTensorDescriptor_t a_desc,
                                       infiniopTensorDescriptor_t b_desc) {
    if (!c_desc || !a_desc || !b_desc) {
        return INFINI_STATUS_BAD_PARAM;
    }
    // Destination cannot have broadcast setup: a zero-stride output dim
    // would make many loop iterations write the same element.  Validate
    // before touching `info` so callers never see a half-filled struct.
    if (c_desc->hasBroadcastDim()) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }
    info.c_data_size = c_desc->numel();
    info.ndim = c_desc->ndim();
    info.contiguous = c_desc->isContiguous() && a_desc->isContiguous() && b_desc->isContiguous();
    // Inputs count as broadcasted when the layouts are not all contiguous
    // and either the ranks differ or an input has a zero-stride dimension.
    const bool ndim_match = (info.ndim == a_desc->ndim()) && (info.ndim == b_desc->ndim());
    info.broadcasted = !info.contiguous && (!ndim_match || a_desc->hasBroadcastDim() || b_desc->hasBroadcastDim());
    // Fix: the previous std::move() on each accessor result was redundant
    // (moving a returned temporary) at best, and would silently gut the
    // descriptor's internals if shape()/strides() ever returned references.
    info.c_shape = c_desc->shape();
    info.a_shape = a_desc->shape();
    info.b_shape = b_desc->shape();
    info.c_strides = c_desc->strides();
    info.a_strides = a_desc->strides();
    info.b_strides = b_desc->strides();
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::binary
#endif // __INFINIOP_BINARY_H__
#ifndef __INFINIOP_BINARY_CPU_H__
#define __INFINIOP_BINARY_CPU_H__
#include "../../devices/cpu/common_cpu.h"
#include "../binary.h"
#include <utility>
namespace op::common_cpu {
namespace binary_op {
// Perform binary computation when inputs and the output can have different dtypes
template <typename Tc, typename Ta, typename Tb, typename BinaryOp, typename... Args>
void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *b, Args &&...args) {
auto a_ = reinterpret_cast<const Ta *>(a);
auto b_ = reinterpret_cast<const Tb *>(b);
auto c_ = reinterpret_cast<Tc *>(c);
ptrdiff_t data_size = info.c_data_size;
#pragma omp parallel for
for (ptrdiff_t i = 0; i < data_size; ++i) {
size_t a_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.a_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data()));
size_t b_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.b_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data()));
size_t c_index = info.contiguous ? i : (op::common_cpu::indexToOffset(i, info.ndim, info.c_shape.data(), info.c_strides.data()));
c_[c_index] = BinaryOp{}(a_[a_index], b_[b_index], std::forward<Args>(args)...);
}
}
// Perform binary computation when all inputs and the output share the same dtype
template <typename Tdata, typename BinaryOp, typename... Args>
void calculate(op::binary::BinaryInfo info, void *c, const void *a, const void *b, Args &&...args) {
auto a_ = reinterpret_cast<const Tdata *>(a);
auto b_ = reinterpret_cast<const Tdata *>(b);
auto c_ = reinterpret_cast<Tdata *>(c);
ptrdiff_t data_size = info.c_data_size;
#pragma omp parallel for
for (ptrdiff_t i = 0; i < data_size; ++i) {
size_t a_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.a_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.a_shape.data(), info.a_strides.data()));
size_t b_index = info.contiguous ? i : (info.broadcasted ? op::common_cpu::indexToReducedOffset(i, info.ndim, info.c_strides.data(), info.b_strides.data()) : op::common_cpu::indexToOffset(i, info.ndim, info.b_shape.data(), info.b_strides.data()));
size_t c_index = info.contiguous ? i : (op::common_cpu::indexToOffset(i, info.ndim, info.c_shape.data(), info.c_strides.data()));
if constexpr (std::is_same_v<Tdata, fp16_t>) {
float a_val = utils::cast<float>(a_[a_index]);
float b_val = utils::cast<float>(b_[b_index]);
c_[c_index] = utils::cast<fp16_t>(BinaryOp{}(a_val, b_val, std::forward<Args>(args)...));
} else {
c_[c_index] = BinaryOp{}(a_[a_index], b_[b_index], std::forward<Args>(args)...);
}
}
}
} // namespace binary_op
} // namespace op::common_cpu
#endif // __INFINIOP_BINARY_CPU_H__
#include "common_cpu.h"
namespace op::common_cpu {
size_t indexToReducedOffset(
size_t flat_index,
size_t ndim,
......@@ -19,7 +21,7 @@ size_t indexToOffset(
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- >= 0;) {
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i];
}
......@@ -48,3 +50,5 @@ std::vector<size_t> getPaddedShape(
}
return padded_shape;
}
} // namespace op::common_cpu
......@@ -13,6 +13,8 @@
#include <omp.h>
#endif
namespace op::common_cpu {
// Returns the memory offset into the original tensor, given the flat index into the broadcasted tensor.
size_t indexToReducedOffset(size_t flat_index, size_t ndim, const ptrdiff_t *broadcasted_strides, const ptrdiff_t *target_strides);
......@@ -28,4 +30,6 @@ size_t getPaddedSize(size_t ndim, size_t *shape, const size_t *pads);
// calculate the padded shape and store the result in padded_shape
std::vector<size_t> getPaddedShape(size_t ndim, const size_t *shape, const size_t *pads);
} // namespace op::common_cpu
#endif // __INFINIOP__COMMON_CPU_H__
#include "swiglu_cpu.h"
namespace op::swiglu::cpu {
Descriptor::~Descriptor() = default;
// Builds a CPU SwiGLU descriptor after validating dtypes and shapes.
//
// `out`, `up` and `gate` must all have the same shape and the same dtype
// (F16, F32 or F64); the kernels reinterpret every buffer as the output
// dtype, so a mismatched input would be read as garbage.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t up_desc,
    infiniopTensorDescriptor_t gate_desc) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &out_shape = out_desc->shape();
    const auto &up_shape = up_desc->shape();
    const auto &gate_shape = gate_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    // Fix: previously only the output dtype was checked; an input tensor of
    // a different dtype would have been reinterpreted as the output dtype.
    if (up_desc->dtype() != dtype || gate_desc->dtype() != dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    if (!SAME_VEC(out_shape, up_shape, gate_shape)) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }

    op::binary::BinaryInfo info;
    CHECK_STATUS(op::binary::createBinaryInfo(info, out_desc, up_desc, gate_desc));

    // Create descriptor; CPU needs no opaque backend state.
    *desc_ptr = new Descriptor(
        dtype,
        std::move(info),
        nullptr,
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Runs the elementwise SwiGLU kernel, dispatching on the dtype captured at
// descriptor creation time.  `stream` is accepted for interface parity but
// unused on CPU.
infiniStatus_t Descriptor::calculate(
    void *c,
    const void *a,
    const void *b,
    void *stream) const {
    if (_dtype == INFINI_DTYPE_F16) {
        op::common_cpu::binary_op::calculate<fp16_t, SwiGLUOp>(_info, c, a, b);
    } else if (_dtype == INFINI_DTYPE_F32) {
        op::common_cpu::binary_op::calculate<float, SwiGLUOp>(_info, c, a, b);
    } else if (_dtype == INFINI_DTYPE_F64) {
        op::common_cpu::binary_op::calculate<double, SwiGLUOp>(_info, c, a, b);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::cpu
#ifndef __SWIGLU_CPU_H__
#define __SWIGLU_CPU_H__
#include "../../../binary/cpu/binary_cpu.h"

#include <cmath>
BINARY_DESCRIPTOR(swiglu, cpu)
// Functor computing SwiGLU: out = gate * sigmoid(gate) * up.
// Instantiated with float/double by the CPU dispatch (fp16 buffers are
// promoted to float by the shared binary kernel before this is called).
struct SwiGLUOp {
private:
    // Logistic sigmoid: 1 / (1 + e^{-x}).
    template <typename T>
    T logistic(const T &x) const {
        return 1 / (1 + std::exp(-x));
    }

public:
    template <typename T>
    T operator()(const T &up, const T &gate) const {
        const T silu = gate * logistic(gate);
        return silu * up;
    }
};
#endif // __SWIGLU_CPU_H__
......@@ -2,15 +2,30 @@
#include "../../handle.h"
#include "infiniop/ops/swiglu.h"
#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
__C infiniStatus_t infiniopCreateSwiGLUDescriptor(
infiniopHandle_t handle, infiniopSwiGLUDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopHandle_t handle,
infiniopSwiGLUDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::swiglu::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::swiglu::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
a_desc, \
b_desc)
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateSwiGLUDescriptor(
handle, (SwiGLUCpuDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
......@@ -43,17 +58,30 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
return musaCreateSwiGLUDescriptor(
handle, (SwiGLUMusaDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
#endif
}
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
};
}
#undef CREATE
}
__C infiniStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, void *c,
const void *a, const void *b,
__C infiniStatus_t infiniopSwiGLU(
infiniopSwiGLUDescriptor_t desc,
void *c,
const void *a,
const void *b,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc) \
->calculate(c, a, b, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuSwiGLU((SwiGLUCpuDescriptor_t)desc, c, a, b, stream);
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
......@@ -76,16 +104,26 @@ __C infiniStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, void *c,
case DevMthreadsGpu:
return musaSwiGLU((SwiGLUMusaDescriptor_t)desc, c, a, b, stream);
#endif
}
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__C infiniStatus_t
infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroySwiGLUDescriptor((SwiGLUCpuDescriptor_t)desc);
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
......@@ -108,6 +146,10 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
case DevMthreadsGpu:
return musaDestroySwiGLUDescriptor((SwiGLUMusaDescriptor_t)desc);
#endif
}
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
......@@ -28,6 +28,10 @@ public:
bool isContiguous() const;
size_t numel() const;
// a dim is broadcasted if its corresponding stride is 0 but its size > 1
bool hasBroadcastDim() const;
std::vector<size_t> getBroadcastDim() const;
infiniopTensorDescriptor_t dimMerge(size_t dim_start, size_t dim_end) const;
infiniopTensorDescriptor_t dimSplit(size_t axis, const std::vector<size_t> &dims) const;
infiniopTensorDescriptor_t dimPermute(const std::vector<size_t> &order) const;
......
#include "../utils.h"
#include "tensor.h"
#include <algorithm>
#include <cstring>
#include <functional>
#include <numeric>
......@@ -85,6 +86,24 @@ bool InfiniopTensorDescriptor::isContiguous() const {
return isContiguous(0, ndim() - 1);
}
bool InfiniopTensorDescriptor::hasBroadcastDim() const {
return std::any_of(
_shape.begin(), _shape.end(),
[&, i = 0](const auto &) mutable {
return _shape[i] != 1 && _strides[i++] == 0;
});
}
// Collects the indices of all broadcast dimensions, i.e. every axis whose
// size exceeds 1 while its stride is 0.
std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
    std::vector<size_t> broadcast_axes;
    const size_t rank = ndim();
    size_t axis = 0;
    while (axis < rank) {
        const bool is_broadcast = (_strides[axis] == 0) && (_shape[axis] != 1);
        if (is_broadcast) {
            broadcast_axes.push_back(axis);
        }
        ++axis;
    }
    return broadcast_axes;
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
if (dim_start > dim_end || dim_end >= ndim()) {
return nullptr;
......
#ifndef INFINIUTILS_CHECK_H
#define INFINIUTILS_CHECK_H
#include <iostream>
#include <tuple>
#define CHECK_API_OR(API, EXPECT, ACTION) \
do { \
......@@ -30,4 +31,13 @@
return INFINI_STATUS_BAD_TENSOR_DTYPE); \
} while (0)
// SAME_VEC(v0, v1, ...): expands to true iff every argument compares equal
// (via operator==) to the first one.  A single argument yields true.
#define SAME_VEC(...)                                        \
    [&] {                                                    \
        const auto &_head =                                  \
            std::get<0>(std::forward_as_tuple(__VA_ARGS__)); \
        return [&_head](const auto &..._elems) {             \
            return ((_elems == _head) && ...);               \
        }(__VA_ARGS__);                                      \
    }()
#endif // INFINIUTILS_CHECK_H
......@@ -25,19 +25,25 @@ _TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
# ((13, 4, 4), None, None, None),
# ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
# ((4, 4, 5632), None, None, None),
# ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
    """Aliasing mode for a test case: compute out-of-place, or write the
    result in place into operand a or operand b."""
    OUT_OF_PLACE = auto()
    INPLACE_A = auto()
    INPLACE_B = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
"Inplace.OUT_OF_PLACE",
"Inplace.INPLACE_A",
"Inplace.INPLACE_B",
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
Inplace.INPLACE_B,
]
# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
......@@ -48,7 +54,7 @@ _TEST_CASES = [
]
# Data types used for testing
_TENSOR_DTYPES = [torch.float16]
_TENSOR_DTYPES = [torch.float16, torch.float32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
......@@ -61,12 +67,6 @@ NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
INPLACE_B = auto()
class SwiGLUDescriptor(Structure):
    """ctypes view of the C SwiGLU descriptor handle; only the leading
    ``device`` field is declared (the rest is opaque to Python)."""
    _fields_ = [("device", c_int32)]
......@@ -132,7 +132,7 @@ def test(
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [a_tensor, b_tensor, c_tensor]:
tensor.descriptor.contents.invalidate()
tensor.destroyDesc(lib)
def lib_swiglu():
check_error(
......
......@@ -37,3 +37,8 @@ target("infinirt-cpu")
set_languages("cxx17")
add_files("../src/infinirt/cpu/*.cc")
target_end()
if has_config("omp") then
add_requires("openmp")
add_packages("openmp")
end
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment