Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
#ifndef __ACLNN_SWIGLU_H__
#define __ACLNN_SWIGLU_H__
#include "../../../../utils.h"
#include "../../../../utils/check.h"
#include "../../../operator.h"
#include "../../../tensor.h"
namespace op::swiglu::ascend {
class SwigluInfo {
SwigluInfo() = default;
public:
infiniDtype_t dtype;
std::vector<size_t> shape;
int32_t ndim;
std::vector<ptrdiff_t> c_strides;
std::vector<ptrdiff_t> a_strides;
std::vector<ptrdiff_t> b_strides;
static utils::Result<SwigluInfo> create(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
CHECK_OR_RETURN(c_desc && a_desc && b_desc, INFINI_STATUS_BAD_PARAM);
CHECK_OR_RETURN(!c_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
CHECK_OR_RETURN(c_desc->ndim() == a_desc->ndim()
&& c_desc->ndim() == b_desc->ndim()
&& (c_desc->ndim() == 2 || c_desc->ndim() == 3),
INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_SAME_SHAPE(c_desc->shape(), a_desc->shape(), b_desc->shape());
int32_t ndim = c_desc->ndim();
CHECK_OR_RETURN(c_desc->stride(ndim - 1) == 1
&& a_desc->stride(ndim - 1) == 1
&& b_desc->stride(ndim - 1) == 1,
INFINI_STATUS_BAD_TENSOR_STRIDES);
CHECK_OR_RETURN(c_desc->dtype() == a_desc->dtype()
&& c_desc->dtype() == b_desc->dtype(),
INFINI_STATUS_BAD_TENSOR_DTYPE);
return utils::Result<SwigluInfo>(SwigluInfo{
c_desc->dtype(),
c_desc->shape(),
ndim,
c_desc->strides(),
a_desc->strides(),
b_desc->strides(),
});
}
};
class Descriptor final : public InfiniopDescriptor {
SwigluInfo _info;
size_t _workspace_size;
Descriptor(SwigluInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id) : InfiniopDescriptor{device_type, device_id},
_info(info), _workspace_size(workspace_size) {}
public:
~Descriptor();
static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
std::vector<infiniopTensorDescriptor_t> input_descs);
size_t workspaceSize() const { return _workspace_size; }
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *c,
std::vector<const void *> inputs,
void *stream) const;
};
extern "C" infiniStatus_t swiglu_kernel_launch(
void *c, void *a, void *b,
infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b, void *stream);
} // namespace op::swiglu::ascend
#endif // __ACLNN_SWIGLU_H__
#include "../../../devices/ascend/ascend_kernel_common.h"
using namespace AscendC;
template <typename T>
class SwigluKernel {
public:
__aicore__ inline SwigluKernel() {}
__aicore__ inline void init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
size_t batch_, size_t seq, size_t hd,
ptrdiff_t stride_batch_c,
ptrdiff_t stride_batch_a,
ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c,
ptrdiff_t stride_seq_a,
ptrdiff_t stride_seq_b);
__aicore__ inline void process();
private:
__aicore__ inline void copyIn(size_t i);
__aicore__ inline void compute(size_t i);
__aicore__ inline void copyOut(size_t i);
private:
GlobalTensor<T> _c_gm, _a_gm, _b_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> _in_queue_a, _in_queue_b;
TQue<QuePosition::VECOUT, BUFFER_NUM> _out_queue_c;
TPipe _pipe;
float _beta_value = 1.0f;
size_t _block_idx, _tile_len, _copy_len,
_batch, _seq_len, _hidden_size,
_stride_seq_a, _stride_seq_b, _stride_seq_c;
int64_t _stride_batch_a = 1, _stride_batch_b = 1, _stride_batch_c = 1;
};
template <typename T>
__aicore__ inline void SwigluKernel<T>::init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
size_t batch_, size_t seq, size_t hd,
ptrdiff_t stride_batch_c,
ptrdiff_t stride_batch_a,
ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c,
ptrdiff_t stride_seq_a,
ptrdiff_t stride_seq_b) {
// Init shape & stride variables
_batch = batch_;
_seq_len = seq;
_hidden_size = hd;
_stride_batch_a = stride_batch_a;
_stride_batch_b = stride_batch_b;
_stride_batch_c = stride_batch_c;
_stride_seq_a = stride_seq_a;
_stride_seq_b = stride_seq_b;
_stride_seq_c = stride_seq_c;
_block_idx = GetBlockIdx();
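// Split the hidden dimension across BLOCK_NUM cores: the first (hd % BLOCK_NUM)
// cores take one extra element so the whole row is covered.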
_tile_len = _block_idx < (_hidden_size % BLOCK_NUM) ? (_hidden_size / BLOCK_NUM) + 1 : (_hidden_size / BLOCK_NUM);
_copy_len = alignTileLen<T>(_tile_len, BYTE_ALIGN);
// Set global tensor
_a_gm.SetGlobalBuffer((__gm__ T *)a);
_b_gm.SetGlobalBuffer((__gm__ T *)b);
_c_gm.SetGlobalBuffer((__gm__ T *)c);
// _pipe allocates memory for each queue, in bytes
_pipe.InitBuffer(_in_queue_a, BUFFER_NUM, _copy_len * sizeof(T));
_pipe.InitBuffer(_in_queue_b, BUFFER_NUM, _copy_len * sizeof(T));
_pipe.InitBuffer(_out_queue_c, BUFFER_NUM, _copy_len * sizeof(T));
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyIn(size_t i) {
// Alloc tensor from queue memory
LocalTensor<T> aLocal = _in_queue_a.AllocTensor<T>();
LocalTensor<T> bLocal = _in_queue_b.AllocTensor<T>();
// Get idx of current tile
auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
auto seq_idx = _batch == 1 ? i : i % _seq_len;
ptrdiff_t idxa = batch_idx * _stride_batch_a + seq_idx * _stride_seq_a + _block_idx * _tile_len;
ptrdiff_t idxb = batch_idx * _stride_batch_b + seq_idx * _stride_seq_b + _block_idx * _tile_len;
// Copy the i-th tile from global memory into the local tensors
DataCopy(aLocal, _a_gm[idxa], _copy_len);
DataCopy(bLocal, _b_gm[idxb], _copy_len);
// Enqueue input tensors to the VECIN queues
_in_queue_a.EnQue(aLocal);
_in_queue_b.EnQue(bLocal);
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::compute(size_t i) {
// Dequeue input tensors from the VECIN queues
LocalTensor<T> aLocal = _in_queue_a.DeQue<T>();
LocalTensor<T> bLocal = _in_queue_b.DeQue<T>();
LocalTensor<T> cLocal = _out_queue_c.AllocTensor<T>();
// Call the AscendC SwiGLU API
SwiGLU<T, false>(cLocal, aLocal, bLocal, _beta_value, _copy_len);
// Enqueue the result and free the inputs
_out_queue_c.EnQue<T>(cLocal);
_in_queue_a.FreeTensor(aLocal);
_in_queue_b.FreeTensor(bLocal);
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyOut(size_t i) {
// Dequeue the output tensor from the VECOUT queue
LocalTensor<T> cLocal = _out_queue_c.DeQue<T>();
auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
auto seq_idx = _batch == 1 ? i : i % _seq_len;
ptrdiff_t idxc = batch_idx * _stride_batch_c + seq_idx * _stride_seq_c + _block_idx * _tile_len;
// Copy the i-th tile from the local tensor back to global memory
if (_tile_len * sizeof(T) % BYTE_ALIGN != 0) {
DataCopyExtParams dcep = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
DataCopyPad(_c_gm[idxc], cLocal, dcep);
} else {
DataCopy(_c_gm[idxc], cLocal, _tile_len);
}
// Free output Local tensor
_out_queue_c.FreeTensor(cLocal);
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::process() {
for (size_t i = 0; i < _batch * _seq_len; ++i) {
copyIn(i);
compute(i);
copyOut(i);
}
}
#define DEFINE_SWIGLU_KERNEL(KERNEL_NAME, TYPE) \
__global__ __aicore__ void KERNEL_NAME(GM_ADDR c, GM_ADDR a, GM_ADDR b, \
size_t batch, size_t seq, size_t hd, \
ptrdiff_t stride_batch_c, \
ptrdiff_t stride_batch_a, \
ptrdiff_t stride_batch_b, \
ptrdiff_t stride_seq_c, \
ptrdiff_t stride_seq_a, \
ptrdiff_t stride_seq_b) { \
SwigluKernel<TYPE> op; \
op.init(c, a, b, \
batch, seq, hd, \
stride_batch_c, stride_batch_a, stride_batch_b, \
stride_seq_c, stride_seq_a, stride_seq_b); \
op.process(); \
}
DEFINE_SWIGLU_KERNEL(swiglu_kernel_half, half)
DEFINE_SWIGLU_KERNEL(swiglu_kernel_float, float)
#undef DEFINE_SWIGLU_KERNEL
extern "C" infiniStatus_t swiglu_kernel_launch(
void *c, void *a, void *b,
infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b, void *stream) {
#define LAUNCH_SWIGLU_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
case DTYPE_ENUM: \
KERNEL_NAME<<<BLOCK_NUM, nullptr, stream>>>( \
c, a, b, \
batch, \
seq, \
hd, \
stride_batch_c, stride_batch_a, stride_batch_b, \
stride_seq_c, stride_seq_a, stride_seq_b); \
return INFINI_STATUS_SUCCESS;
switch (dtype) {
LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F16, swiglu_kernel_half)
LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F32, swiglu_kernel_float)
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_SWIGLU_KERNEL
}
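// Hedged usage sketch (not part of this patch): launching the kernel for
// contiguous [batch, seq, hd] fp32 tensors. launch_contiguous_f32 is an
// illustrative helper; the device pointers and the stream are assumed to have
// been allocated/created by the caller.
static infiniStatus_t launch_contiguous_f32(void *c_dev, void *a_dev, void *b_dev,
                                            size_t batch, size_t seq, size_t hd,
                                            void *stream) {
    // For a contiguous row-major layout the batch stride is seq * hd and the
    // sequence stride is hd, for all three tensors.
    return swiglu_kernel_launch(
        c_dev, a_dev, b_dev,
        INFINI_DTYPE_F32, batch, seq, hd,
        (ptrdiff_t)(seq * hd), (ptrdiff_t)(seq * hd), (ptrdiff_t)(seq * hd),
        (ptrdiff_t)hd, (ptrdiff_t)hd, (ptrdiff_t)hd,
        stream);
}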
......@@ -8,50 +8,41 @@ infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t up_desc,
infiniopTensorDescriptor_t gate_desc) {
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &up_desc = input_desc_vec.at(0);
const auto &gate_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
if (!SAME_VEC(out_shape, up_shape, gate_shape)) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
op::binary::BinaryInfo info;
CHECK_STATUS(op::binary::createBinaryInfo(info, out_desc, up_desc, gate_desc));
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// Create descriptor
*desc_ptr = new Descriptor(
dtype,
std::move(info),
nullptr,
handle->device,
handle->device_id);
// create CPU elementwise descriptor
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *c,
const void *a,
const void *b,
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
op::common_cpu::binary_op::calculate<fp16_t, SwiGLUOp>(_info, c, a, b);
break;
return _device_info->calculate<SwiGLUOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
op::common_cpu::binary_op::calculate<float, SwiGLUOp>(_info, c, a, b);
break;
return _device_info->calculate<SwiGLUOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
op::common_cpu::binary_op::calculate<double, SwiGLUOp>(_info, c, a, b);
break;
return _device_info->calculate<SwiGLUOp, double>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
......
#ifndef __SWIGLU_CPU_H__
#define __SWIGLU_CPU_H__
#include "../../../binary/cpu/binary_cpu.h"
#include "../../../elementwise/cpu/elementwise_cpu.h"
BINARY_DESCRIPTOR(swiglu, cpu)
ELEMENTWISE_DESCRIPTOR(swiglu, cpu)
struct SwiGLUOp {
namespace op::swiglu::cpu {
typedef struct SwiGLUOp {
private:
template <typename T>
T sigmoid(const T &x) const {
return 1 / (1 + std::exp(-x));
return T(1) / (T(1) + std::exp(-x));
}
public:
static constexpr size_t num_inputs = 2;
template <typename T>
T operator()(const T &up, const T &gate) const {
return gate * sigmoid(gate) * up;
}
};
} SwiGLUOp;
} // namespace op::swiglu::cpu
#endif // __SWIGLU_CPU_H__
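// Hedged reference sketch (not part of this patch): the scalar math that
// SwiGLUOp above applies per element; swiglu_ref is an illustrative name.
#include <cmath>

inline float swiglu_ref(float up, float gate) {
    float sig = 1.0f / (1.0f + std::exp(-gate)); // sigmoid(gate)
    return gate * sig * up;                      // matches SwiGLUOp::operator()
}
// e.g. swiglu_ref(2.0f, 0.0f) == 0.0f, since gate == 0 and sigmoid(0) == 0.5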
#include "swiglu_cuda.cuh"
#include "swiglu_cuda_internal.cuh"
namespace op::swiglu::cuda {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &up_desc = input_desc_vec.at(0);
const auto &gate_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::cuda
#ifndef __SWIGLU_CUDA_API_H__
#define __SWIGLU_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(swiglu, cuda)
#endif // __SWIGLU_CUDA_API_H__
#ifndef __SWIGLU_CUDA_H__
#define __SWIGLU_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>
namespace op::swiglu::cuda {
typedef struct SwiGLUOp {
private:
template <typename T>
__device__ __forceinline__ T sigmoid(const T &x) const {
if constexpr (std::is_same_v<T, half2>) {
return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
} else if constexpr (std::is_same_v<T, half>) {
return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
} else if constexpr (std::is_same_v<T, float>) {
return __frcp_rn(__fadd_rn(1, __expf(-x)));
} else {
return 1 / (1 + std::exp(-x));
}
}
public:
static constexpr size_t num_inputs = 2;
template <typename T>
__device__ __forceinline__ T operator()(const T &up, const T &gate) const {
if constexpr (std::is_same_v<T, half2>) {
return __hmul2(__hmul2(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, half>) {
return __hmul(__hmul(gate, sigmoid(gate)), up);
} else if constexpr (std::is_same_v<T, float>) {
return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
} else {
return gate * sigmoid(gate) * up;
}
}
} SwiGLUOp;
} // namespace op::swiglu::cuda
#endif // __SWIGLU_CUDA_H__
#include "swiglu_kunlun.h"
// Op interface declaration
LAUNCH_ELEMENTWISE_KERNEL(SwiGLU)
namespace op::swiglu::kunlun {
typedef struct SwiGLUOp {
static constexpr size_t num_inputs = 2;
template <typename Tdata, typename... Args>
static infiniStatus_t launch(Args... args) {
launchSwiGLUKernel<Tdata>(args...);
return INFINI_STATUS_SUCCESS;
}
} SwiGLUOp;
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::kunlun::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &up_desc = input_desc_vec.at(0);
const auto &gate_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &up_shape = up_desc->shape();
const auto &gate_shape = gate_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
// create KUNLUN elementwise descriptor
CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F32:
return _device_info->calculate<SwiGLUOp, float>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::swiglu::kunlun
#ifndef __SWIGLU_KUNLUN_H__
#define __SWIGLU_KUNLUN_H__
#include "../../../elementwise/kunlun/elementwise_kunlun.h"
ELEMENTWISE_DESCRIPTOR(swiglu, kunlun)
#endif // __SWIGLU_KUNLUN_H__
#ifndef __SWIGLU_KUNLUN_H__
#define __SWIGLU_KUNLUN_H__
#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "../../../elementwise/kunlun/elementwise_kunlun_kernel.h"
/// @brief SwiGLU op operating on local memory (LM)
typedef struct SwiGLUOp {
private:
template <typename T>
inline __device__ T sigmoid(T x) const {
return 1.0f / (1.0f + exp(-x));
}
public:
// Every op used with the elementwise framework must define this static member
static constexpr size_t num_inputs = 2;
template <typename T>
inline __device__ T operator()(const T *inputs) const {
T up = inputs[0];
T gate = inputs[1];
T out = gate * sigmoid(gate) * up;
return out;
}
} SwiGLUOp;
// Definition for swiglu kernel interface
LAUNCH_ELEMENTWISE_KERNEL_IMPL(SwiGLU, SwiGLUOp)
// Template instantiation
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(SwiGLU, float)
#endif // __SWIGLU_KUNLUN_H__
......@@ -5,6 +5,15 @@
#ifdef ENABLE_CPU_API
#include "cpu/swiglu_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/swiglu_cuda.cuh"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
#endif
__C infiniStatus_t infiniopCreateSwiGLUDescriptor(
infiniopHandle_t handle,
......@@ -19,19 +28,19 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
handle, \
reinterpret_cast<op::swiglu::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
a_desc, \
b_desc)
{a_desc, \
b_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaCreateSwiGLUDescriptor((CudaHandle_t)handle,
(SwiGLUCudaDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc);
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
......@@ -40,11 +49,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return ascendCreateSwiGLUDescriptor(
(AscendHandle_t)handle, (SwiGLUAscendDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc);
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
......@@ -66,8 +72,52 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#undef CREATE
}
__C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetSwiGLUWorkspaceSize((SwiGLUMacaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetSwiGLUWorkspaceSize((SwiGLUMusaDescriptor_t)desc, size);
}
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniStatus_t infiniopSwiGLU(
infiniopSwiGLUDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
......@@ -76,25 +126,26 @@ __C infiniStatus_t infiniopSwiGLU(
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::swiglu::NAMESPACE::Descriptor *>(desc) \
->calculate(c, a, b, stream)
->calculate(workspace, workspace_size, c, {a, b}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaSwiGLU((SwiGLUCudaDescriptor_t)desc, c, a, b, stream);
#ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangSwiGLU((SwiGLUBangDescriptor_t)desc, c, a, b, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return ascendSwiGLU((SwiGLUAscendDescriptor_t)desc, c, a, b, stream);
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu:
......@@ -125,18 +176,19 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaDestroySwiGLUDescriptor((SwiGLUCudaDescriptor_t)desc);
#ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t)desc);
#ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu:
......
......@@ -3,15 +3,22 @@
#include <cub/block/block_reduce.cuh>
/*
* Device functions for reduction operations on CUDA.
*
* Note: only the result on thread 0 is guaranteed to be correct.
* A manual broadcast is needed for other threads.
*/
namespace op::common_cuda::reduce_op {
// Sum(x^2) on contiguous data of length count
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
Tcompute ss = 0;
// Each thread computes its partial sum
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
ss += Tcompute(data_ptr[i] * data_ptr[i]);
ss += Tcompute(data_ptr[i]) * Tcompute(data_ptr[i]);
}
// Use CUB block-level reduction
......@@ -21,6 +28,36 @@ __device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t cou
return BlockReduce(temp_storage).Sum(ss);
}
// Sum(x) on contiguous data of length count
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, size_t count) {
Tcompute s = 0;
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
s += Tcompute(data_ptr[i]);
}
using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
return BlockReduce(temp_storage).Sum(s);
}
// Max(x) on contiguous data of length count
template <unsigned int BLOCK_SIZE, typename Tdata>
__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
Tdata max_ = data_ptr[0];
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
max_ = cub::Max()(max_, data_ptr[i]);
}
using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE);
}
} // namespace op::common_cuda::reduce_op
#endif
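// Hedged sketch (not part of this patch) of the manual broadcast mentioned in
// the note above: cub::BlockReduce leaves the correct sum only on thread 0, so
// a caller that needs it in every thread stages it through shared memory.
// blockSumSquared is an illustrative name.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ __forceinline__ Tcompute blockSumSquared(const Tdata *data_ptr, size_t count) {
    __shared__ Tcompute broadcast;
    Tcompute ss = op::common_cuda::reduce_op::sumSquared<BLOCK_SIZE, Tdata, Tcompute>(data_ptr, count);
    if (threadIdx.x == 0) {
        broadcast = ss; // only thread 0 holds the reduced value
    }
    __syncthreads();    // make it visible to the whole block
    return broadcast;
}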
#ifndef __INFINIOP_REDUCE_KUNLUN_H__
#define __INFINIOP_REDUCE_KUNLUN_H__
#include "../../devices/kunlun/kunlun_kernel_common.h"
namespace op::common_kunlun::reduce_op {
using namespace device::kunlun::kernel;
// Use 16-float vector instructions to compute the reduction
// data_ptr points to local memory (LM)
static inline __device__ float sumSquaredF32(float *data_ptr, int count) {
__local__ float acc_buf[16];
int remain = count % 16;
int offset_last = count - remain;
int mask = lowerBitMask(remain - 1);
// Load the trailing (count % 16) elements, masked (remaining lanes zeroed)
float32x16_t v_last = vload_lm_float32x16_mz((data_ptr + offset_last), mask);
// Do v_last * v_last
v_last = vvmul_float32x16(v_last, v_last);
// Process the remaining data 16 floats at a time
for (int i = 0; i < offset_last; i += 16) {
float32x16_t v_0 = vload_lm_float32x16_mz(data_ptr + i);
// Do v_0 * v_0
v_0 = vvmul_float32x16(v_0, v_0);
// Add to v_last
v_last = vvadd_float32x16(v_last, v_0);
}
vstore_lm_float32x16_mz(acc_buf, v_last);
mfence();
float res = 0.0f;
for (int i = 0; i < 16; ++i) {
res += acc_buf[i];
}
return res;
}
} // namespace op::common_kunlun::reduce_op
#endif
......@@ -2,9 +2,19 @@
#define __INFINIOP_TENSOR_H__
#include "infiniop/tensor_descriptor.h"
#include "../utils.h"
#include <string>
#include <vector>
#define TRANSFORM_TENSOR_DESC(__TENSOR_DESC__, __OP__) \
do { \
auto __RESULT__ = __TENSOR_DESC__->__OP__; \
CHECK_RESULT(__RESULT__); \
__TENSOR_DESC__ = __RESULT__.take(); \
} while (0)
struct InfiniopTensorDescriptor {
private:
// Datatype
......@@ -32,9 +42,9 @@ public:
bool hasBroadcastDim() const;
std::vector<size_t> getBroadcastDim() const;
infiniopTensorDescriptor_t dimMerge(size_t dim_start, size_t dim_end) const;
infiniopTensorDescriptor_t dimSplit(size_t axis, const std::vector<size_t> &dims) const;
infiniopTensorDescriptor_t dimPermute(const std::vector<size_t> &order) const;
utils::Result<infiniopTensorDescriptor_t> dimMerge(size_t dim_start, size_t dim_end) const;
utils::Result<infiniopTensorDescriptor_t> dimSplit(size_t axis, const std::vector<size_t> &dims) const;
utils::Result<infiniopTensorDescriptor_t> dimPermute(const std::vector<size_t> &order) const;
std::string toString() const;
};
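// Hedged usage sketch (not part of this patch): with dimMerge/dimSplit/dimPermute
// now returning utils::Result, TRANSFORM_TENSOR_DESC above replaces a descriptor
// in place and propagates failures. `desc` is an illustrative variable inside a
// function that returns a status.
//
//   TRANSFORM_TENSOR_DESC(desc, dimMerge(0, 1));    // merge dims 0..1 into one
//   std::vector<size_t> order = {1, 0};
//   TRANSFORM_TENSOR_DESC(desc, dimPermute(order)); // swap the two remaining dims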
......
......@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip
std::vector<ptrdiff_t> strides(ndim);
ptrdiff_t dsize = 1;
if (ndim > 0) {
for (size_t i = ndim - 1; i >= 0; i--) {
for (int i = (int)ndim - 1; i >= 0; i--) {
strides[i] = dsize;
dsize *= shape_[i];
}
......@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
return res;
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
if (dim_start > dim_end || dim_end >= ndim()) {
return nullptr;
}
utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
CHECK_OR_RETURN(dim_start <= dim_end && dim_end < ndim(), INFINI_STATUS_BAD_PARAM);
size_t new_ndim = ndim() - (dim_end - dim_start);
std::vector<size_t> new_shape(new_ndim);
......@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++;
}
if (!isContiguous(dim_start, dim_end)) {
return nullptr;
}
CHECK_OR_RETURN(isContiguous(dim_start, dim_end), INFINI_STATUS_BAD_PARAM);
new_shape[index] = 1;
for (size_t i = dim_start; i <= dim_end; i++) {
......@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++;
}
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data());
return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const {
utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const {
size_t ndim_ = ndim();
if (dim(axis) != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>())) {
return nullptr;
}
CHECK_OR_RETURN(dim(axis) == std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>()),
INFINI_STATUS_BAD_PARAM);
size_t new_ndim = ndim_ + dims.size() - 1;
std::vector<size_t> new_shape(new_ndim);
......@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const
index++;
}
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data());
return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
}
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const {
utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const {
auto ndim_ = ndim();
if (order.size() != ndim_) {
return nullptr;
}
CHECK_OR_RETURN(order.size() == ndim_, INFINI_STATUS_BAD_PARAM);
std::vector<size_t> new_shape(ndim_);
std::vector<ptrdiff_t> new_strides(ndim_);
for (size_t i = 0; i < ndim_; i++) {
if (std::find(order.begin(), order.end(), i) == order.end()) {
return nullptr;
}
CHECK_OR_RETURN(std::find(order.begin(), order.end(), i) != order.end(), INFINI_STATUS_BAD_PARAM);
new_shape[i] = dim(order[i]);
new_strides[i] = stride(order[i]);
}
return new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data());
return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data()));
}
std::string InfiniopTensorDescriptor::toString() const {
......
......@@ -6,7 +6,8 @@
namespace infinirt::bang {
infiniStatus_t getDeviceCount(int *count) {
CHECK_BANGRT(cnrtGetDeviceCount(count));
unsigned int device_count = static_cast<unsigned int>(*count);
CHECK_BANGRT(cnrtGetDeviceCount(&device_count));
return INFINI_STATUS_SUCCESS;
}
......@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() {
infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {
cnrtQueue_t queue;
CHECK_BANGRT(cnrtQueueCreate(&stream));
CHECK_BANGRT(cnrtQueueCreate(&queue));
*stream_ptr = queue;
return INFINI_STATUS_SUCCESS;
}
......@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
}
infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) {
auto status = cnrtQueryNotifier((cnrtQueue_t)stream);
auto status = cnrtQueryNotifier((cnrtNotifier_t)event);
if (status == cnrtSuccess) {
*status_ptr = INFINIRT_EVENT_COMPLETE;
} else if (status == cnrtErrorBusy) {
......@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
}
infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) {
CHECK_BANGRT(cnrtMemcpy(dst, src, size, toBangMemcpyKind(kind)));
CHECK_BANGRT(cnrtMemcpy(dst, (void *)src, size, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, (void *)src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
......
......@@ -3,7 +3,7 @@
#include "../infinirt_impl.h"
namespace infinirt::bang {
#ifdef ENABLE_BANG_API
#ifdef ENABLE_CAMBRICON_API
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
......
......@@ -4,6 +4,9 @@
#include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "maca/infinirt_maca.h"
#include "musa/infinirt_musa.h"
thread_local infiniDevice_t CURRENT_DEVICE_TYPE = INFINI_DEVICE_CPU;
thread_local int CURRENT_DEVICE_ID = 0;
......@@ -42,40 +45,49 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
return INFINI_STATUS_SUCCESS;
}
#define INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, ACTION) \
{ \
infiniStatus_t _status; \
switch (CURRENT_DEVICE_TYPE) { \
case INFINI_DEVICE_CPU: \
_status = infinirt::cpu::API PARAMS; \
break; \
case INFINI_DEVICE_NVIDIA: \
_status = infinirt::cuda::API PARAMS; \
break; \
case INFINI_DEVICE_CAMBRICON: \
_status = infinirt::bang::API PARAMS; \
break; \
case INFINI_DEVICE_ASCEND: \
_status = infinirt::ascend::API PARAMS; \
break; \
default: \
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \
{ ACTION; } \
return _status; \
#define INFINIRT_CALL_DEVICE_API_AND(DEVICE_TYPE, API, PARAMS, ACTION) \
{ \
infiniStatus_t _status; \
switch (DEVICE_TYPE) { \
case INFINI_DEVICE_CPU: \
_status = infinirt::cpu::API PARAMS; \
break; \
case INFINI_DEVICE_NVIDIA: \
_status = infinirt::cuda::API PARAMS; \
break; \
case INFINI_DEVICE_CAMBRICON: \
_status = infinirt::bang::API PARAMS; \
break; \
case INFINI_DEVICE_ASCEND: \
_status = infinirt::ascend::API PARAMS; \
break; \
case INFINI_DEVICE_METAX: \
_status = infinirt::maca::API PARAMS; \
break; \
case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \
break; \
case INFINI_DEVICE_KUNLUN: \
_status = infinirt::kunlun::API PARAMS; \
break; \
default: \
_status = INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \
{ ACTION; } \
return _status; \
}
#define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(API, PARAMS, )
#define INFINIRT_CALL_DEVICE_API(API, PARAMS) INFINIRT_CALL_DEVICE_API_AND(CURRENT_DEVICE_TYPE, API, PARAMS, )
__C infiniStatus_t infinirtGetDeviceCount(infiniDevice_t device, int *count) {
if (count == nullptr) {
return INFINI_STATUS_NULL_POINTER;
}
INFINIRT_CALL_DEVICE_API(getDeviceCount, (count));
INFINIRT_CALL_DEVICE_API_AND(device, getDeviceCount, (count), {});
}
__C infiniStatus_t infinirtSetDevice(infiniDevice_t device, int device_id) {
INFINIRT_CALL_DEVICE_API_AND(setDevice, (device_id),
INFINIRT_CALL_DEVICE_API_AND(device, setDevice, (device_id),
{ CURRENT_DEVICE_TYPE = device;
CURRENT_DEVICE_ID = device_id; });
}
......
......@@ -98,4 +98,14 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) {
}
}
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
namespace utils {
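// Round size up to the nearest multiple of alignment; assumes alignment is a
// power of two, e.g. align(10, 8) == 16.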
inline size_t align(size_t size, size_t alignment) {
return (size + alignment - 1) & ~(alignment - 1);
}
} // namespace utils
#endif
......@@ -3,6 +3,16 @@
#include <iostream>
#include <tuple>
#define CHECK_OR_RETURN(CONDITION, ERROR) \
do { \
if (!(CONDITION)) { \
std::cerr << "Check Failed: `(" << #CONDITION << ")` is False" \
<< " from " << __func__ \
<< " at " << __FILE__ << ":" << __LINE__ << std::endl; \
return ERROR; \
} \
} while (0)
#define CHECK_API_OR(API, EXPECT, ACTION) \
do { \
auto api_result_ = (API); \
......@@ -31,13 +41,22 @@
return INFINI_STATUS_BAD_TENSOR_DTYPE); \
} while (0)
#define SAME_VEC(...) \
[&] { \
auto &&_vec = std::forward_as_tuple(__VA_ARGS__); \
const auto &_base = std::get<0>(_vec); \
return [&_base](auto &&...args) { \
return ((args == _base) && ...); \
}(__VA_ARGS__); \
}()
#define CHECK_DTYPE_ANY_INT(DT) \
CHECK_DTYPE(DT, \
INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, \
INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
#define CHECK_SAME_VEC(ERR, FIRST, ...) \
do { \
for (const auto &shape___ : {__VA_ARGS__}) { \
if (FIRST != shape___) { \
return ERR; \
} \
} \
} while (0)
#define CHECK_SAME_SHAPE(FIRST, ...) CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_SHAPE, FIRST, __VA_ARGS__)
#define CHECK_SAME_STRIDES(FIRST, ...) CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__)
#endif // INFINIUTILS_CHECK_H