Commit 9b32b4b1 authored by Catheriany's avatar Catheriany
Browse files

Merge remote-tracking branch 'origin/main' into issue/150

parents 15bcbdfc 4799ddbf
#ifndef __ACLNN_SWIGLU_H__
#define __ACLNN_SWIGLU_H__
#include "../../../../utils.h"
#include "../../../../utils/check.h"
#include "../../../operator.h"
#include "../../../tensor.h"
namespace op::swiglu::ascend {
// Validated launch metadata (dtype, shape, strides) for the Ascend SwiGLU op.
// Instances are only built through create(); the default constructor is
// private so callers cannot bypass validation.
class SwigluInfo {
    SwigluInfo() = default;

public:
    infiniDtype_t dtype;              // element type shared by c, a and b
    std::vector<size_t> shape;        // output shape (validated to be 2-D or 3-D)
    int32_t ndim;                     // number of dimensions (2 or 3)
    std::vector<ptrdiff_t> c_strides; // strides of the output tensor c
    std::vector<ptrdiff_t> a_strides; // strides of input tensor a
    std::vector<ptrdiff_t> b_strides; // strides of input tensor b

    // Validates the three descriptors and packs their metadata into a
    // SwigluInfo. Each CHECK_* macro returns early with the given status on
    // failure. Enforced here:
    //   - all three descriptors are non-null,
    //   - the output has no broadcast dimension,
    //   - all tensors share the same ndim, which must be 2 or 3,
    //   - all tensors share the same shape,
    //   - the innermost dimension of each tensor is contiguous (stride == 1),
    //   - all tensors share the same dtype.
    static utils::Result<SwigluInfo> create(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
        CHECK_OR_RETURN(c_desc && a_desc && b_desc, INFINI_STATUS_BAD_PARAM);
        CHECK_OR_RETURN(!c_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
        CHECK_OR_RETURN(c_desc->ndim() == a_desc->ndim()
                            && c_desc->ndim() == b_desc->ndim()
                            && (c_desc->ndim() == 2 || c_desc->ndim() == 3),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_SAME_SHAPE(c_desc->shape(), a_desc->shape(), b_desc->shape());

        int32_t ndim = c_desc->ndim();
        // The last dimension must be dense for the kernel's row-wise DataCopy.
        CHECK_OR_RETURN(c_desc->stride(ndim - 1) == 1
                            && a_desc->stride(ndim - 1) == 1
                            && b_desc->stride(ndim - 1) == 1,
                        INFINI_STATUS_BAD_TENSOR_STRIDES);
        CHECK_OR_RETURN(c_desc->dtype() == a_desc->dtype()
                            && c_desc->dtype() == b_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);

        return utils::Result<SwigluInfo>(SwigluInfo{
            c_desc->dtype(),
            c_desc->shape(),
            ndim,
            c_desc->strides(),
            a_desc->strides(),
            b_desc->strides(),
        });
    }
};
// Ascend SwiGLU operator descriptor: holds the validated tensor metadata and
// the required workspace size. Constructed only through create(); create()
// and calculate() are defined out of line in the device translation unit.
class Descriptor final : public InfiniopDescriptor {
    SwigluInfo _info;       // validated shape/stride/dtype metadata
    size_t _workspace_size; // device workspace required by calculate(), bytes

    Descriptor(SwigluInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id) : InfiniopDescriptor{device_type, device_id},
                                                                                                    _info(info), _workspace_size(workspace_size) {}

public:
    ~Descriptor();

    // Validates c_desc and the input descriptors and stores a new Descriptor
    // into *desc_ptr. `input_descs` carries the SwiGLU input descriptors
    // (presumably {up, gate} -- confirm against the out-of-line definition).
    static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr,
                                 infiniopTensorDescriptor_t c_desc,
                                 std::vector<infiniopTensorDescriptor_t> input_descs);

    // Workspace the caller must provide to calculate().
    size_t workspaceSize() const { return _workspace_size; }

    // Launches the SwiGLU computation on `stream` writing into c.
    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *c,
        std::vector<const void *> inputs,
        void *stream) const;
};
// C-linkage bridge into the AscendC kernel translation unit: launches the
// SwiGLU kernel for a (batch, seq, hd) problem with the given per-tensor
// batch and sequence strides on `stream`.
extern "C" infiniStatus_t swiglu_kernel_launch(
    void *c, void *a, void *b,
    infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
    ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
    ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b, void *stream);
} // namespace op::swiglu::ascend
#endif // __ACLNN_SWIGLU_H__
#include "../../../devices/ascend/ascend_kernel_common.h"
using namespace AscendC;
template <typename T>
class SwigluKernel {
public:
__aicore__ inline SwigluKernel() {}
__aicore__ inline void init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
size_t batch_, size_t seq, size_t hd,
ptrdiff_t stride_batch_c,
ptrdiff_t stride_batch_a,
ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c,
ptrdiff_t stride_seq_a,
ptrdiff_t stride_seq_b);
__aicore__ inline void process();
private:
__aicore__ inline void copyIn(size_t i);
__aicore__ inline void compute(size_t i);
__aicore__ inline void copyOut(size_t i);
private:
GlobalTensor<T> _c_gm, _a_gm, _b_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> _in_queue_a, _in_queue_b;
TQue<QuePosition::VECOUT, BUFFER_NUM> _out_queue_c;
TPipe _pipe;
float _beta_value = 1.0f;
size_t _block_idx, _tile_len, _copy_len,
_batch, _seq_len, _hidden_size,
_stride_seq_a, _stride_seq_b, _stride_seq_c;
int64_t _stride_batch_a = 1, _stride_batch_b = 1, _stride_batch_c = 1;
};
// Records the problem geometry, partitions the hidden dimension across cores
// and allocates the local-memory queues.
template <typename T>
__aicore__ inline void SwigluKernel<T>::init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
                                             size_t batch_, size_t seq, size_t hd,
                                             ptrdiff_t stride_batch_c,
                                             ptrdiff_t stride_batch_a,
                                             ptrdiff_t stride_batch_b,
                                             ptrdiff_t stride_seq_c,
                                             ptrdiff_t stride_seq_a,
                                             ptrdiff_t stride_seq_b) {
    // Init Shape & StrideVariables
    _batch = batch_;
    _seq_len = seq;
    _hidden_size = hd;
    _stride_batch_a = stride_batch_a;
    _stride_batch_b = stride_batch_b;
    _stride_batch_c = stride_batch_c;
    _stride_seq_a = stride_seq_a;
    _stride_seq_b = stride_seq_b;
    _stride_seq_c = stride_seq_c;
    // Split the hidden dimension over BLOCK_NUM cores: the first
    // (_hidden_size % BLOCK_NUM) cores take one extra element each.
    _block_idx = GetBlockIdx();
    _tile_len = _block_idx < (_hidden_size % BLOCK_NUM) ? (_hidden_size / BLOCK_NUM) + 1 : (_hidden_size / BLOCK_NUM);
    // _copy_len is _tile_len rounded up so bulk DataCopy transfers are
    // BYTE_ALIGN-aligned.
    _copy_len = alignTileLen<T>(_tile_len, BYTE_ALIGN);
    // Set global tensor
    _a_gm.SetGlobalBuffer((__gm__ T *)a);
    _b_gm.SetGlobalBuffer((__gm__ T *)b);
    _c_gm.SetGlobalBuffer((__gm__ T *)c);
    // _pipe alloc memory to queue, the unit is bytes:
    // BUFFER_NUM buffers of one padded tile per queue.
    _pipe.InitBuffer(_in_queue_a, BUFFER_NUM, _copy_len * sizeof(T));
    _pipe.InitBuffer(_in_queue_b, BUFFER_NUM, _copy_len * sizeof(T));
    _pipe.InitBuffer(_out_queue_c, BUFFER_NUM, _copy_len * sizeof(T));
}
// Stage this core's tile of inputs a and b for row i into local memory.
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyIn(size_t i) {
    // Alloc tensor from queue memory
    LocalTensor<T> aLocal = _in_queue_a.AllocTensor<T>();
    LocalTensor<T> bLocal = _in_queue_b.AllocTensor<T>();
    // Get idx of current tile: decompose flat row index i into
    // (batch_idx, seq_idx); for a 2-D problem (_batch == 1) i is the row.
    auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
    auto seq_idx = _batch == 1 ? i : i % _seq_len;
    // NOTE(review): the per-core offset `_block_idx * _tile_len` uses this
    // core's own tile length. When _hidden_size % BLOCK_NUM != 0 the first
    // cores have a longer tile, so cores past the remainder appear to start
    // `remainder` elements short -- confirm against the intended tiling.
    ptrdiff_t idxa = batch_idx * _stride_batch_a + seq_idx * _stride_seq_a + _block_idx * _tile_len;
    ptrdiff_t idxb = batch_idx * _stride_batch_b + seq_idx * _stride_seq_b + _block_idx * _tile_len;
    // Copy process_th tile from global tensor to local tensor
    // (_copy_len elements: the tile padded up to the alignment).
    DataCopy(aLocal, _a_gm[idxa], _copy_len);
    DataCopy(bLocal, _b_gm[idxb], _copy_len);
    // Enque input tensor to VECIN queue so compute() can DeQue it
    _in_queue_a.EnQue(aLocal);
    _in_queue_b.EnQue(bLocal);
}
// Runs the fused SwiGLU vector op on the tiles staged by copyIn(),
// publishing the result tile for copyOut().
template <typename T>
__aicore__ inline void SwigluKernel<T>::compute(size_t i) {
    // Pull both staged input tiles out of the VECIN queues.
    LocalTensor<T> tile_a = _in_queue_a.DeQue<T>();
    LocalTensor<T> tile_b = _in_queue_b.DeQue<T>();
    // Reserve an output tile and evaluate SwiGLU over the padded length.
    LocalTensor<T> tile_c = _out_queue_c.AllocTensor<T>();
    SwiGLU<T, false>(tile_c, tile_a, tile_b, _beta_value, _copy_len);
    // Publish the result to VECOUT, then return the input buffers.
    _out_queue_c.EnQue<T>(tile_c);
    _in_queue_a.FreeTensor(tile_a);
    _in_queue_b.FreeTensor(tile_b);
}
// Write this core's computed tile for row i back to the output in global
// memory, using DataCopyPad for unaligned tails.
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyOut(size_t i) {
    // Deque output tensor from VECOUT queue
    LocalTensor<T> cLocal = _out_queue_c.DeQue<T>();
    auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
    auto seq_idx = _batch == 1 ? i : i % _seq_len;
    // NOTE(review): same per-core offset formula as copyIn(); when
    // _hidden_size % BLOCK_NUM != 0, cores past the remainder appear to
    // start `remainder` elements short -- confirm against the tiling scheme.
    ptrdiff_t idxc = batch_idx * _stride_batch_c + seq_idx * _stride_seq_c + _block_idx * _tile_len;
    // Copy progress_th tile from local tensor to global tensor.
    // Only _tile_len elements (not the padded _copy_len) are written; a tile
    // whose byte size is not BYTE_ALIGN-aligned must go through DataCopyPad.
    if (_tile_len * sizeof(T) % BYTE_ALIGN != 0) {
        DataCopyExtParams dcep = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
        DataCopyPad(_c_gm[idxc], cLocal, dcep);
    } else {
        DataCopy(_c_gm[idxc], cLocal, _tile_len);
    }
    // Free output Local tensor
    _out_queue_c.FreeTensor(cLocal);
}
// Drives the pipeline: every (batch, seq) row flows through
// copyIn -> compute -> copyOut in order.
template <typename T>
__aicore__ inline void SwigluKernel<T>::process() {
    const size_t total_rows = _batch * _seq_len;
    for (size_t row = 0; row < total_rows; ++row) {
        copyIn(row);
        compute(row);
        copyOut(row);
    }
}
// Stamps out one __global__ kernel entry point per element type: each entry
// constructs a SwigluKernel<TYPE>, initializes it with the launch arguments
// and runs the full pipeline.
#define DEFINE_SWIGLU_KERNEL(KERNEL_NAME, TYPE) \
    __global__ __aicore__ void KERNEL_NAME(GM_ADDR c, GM_ADDR a, GM_ADDR b, \
                                           size_t batch, size_t seq, size_t hd, \
                                           ptrdiff_t stride_batch_c, \
                                           ptrdiff_t stride_batch_a, \
                                           ptrdiff_t stride_batch_b, \
                                           ptrdiff_t stride_seq_c, \
                                           ptrdiff_t stride_seq_a, \
                                           ptrdiff_t stride_seq_b) { \
        SwigluKernel<TYPE> op; \
        op.init(c, a, b, \
                batch, seq, hd, \
                stride_batch_c, stride_batch_a, stride_batch_b, \
                stride_seq_c, stride_seq_a, stride_seq_b); \
        op.process(); \
    }

// The two dtype specializations dispatched by swiglu_kernel_launch().
DEFINE_SWIGLU_KERNEL(swiglu_kernel_half, half)
DEFINE_SWIGLU_KERNEL(swiglu_kernel_float, float)
#undef DEFINE_SWIGLU_KERNEL
// C-linkage entry used by the host side: dispatches on dtype and launches the
// matching kernel specialization on `stream` across BLOCK_NUM cores.
// Returns INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported dtypes.
extern "C" infiniStatus_t swiglu_kernel_launch(
    void *c, void *a, void *b,
    infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
    ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
    ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b, void *stream) {
// One `case` per supported dtype. The second launch argument is nullptr
// (presumably the l2ctrl slot of the AscendC <<<blockDim, l2ctrl, stream>>>
// launch form -- confirm against the toolchain documentation).
#define LAUNCH_SWIGLU_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
    case DTYPE_ENUM: \
        KERNEL_NAME<<<BLOCK_NUM, nullptr, stream>>>( \
            c, a, b, \
            batch, \
            seq, \
            hd, \
            stride_batch_c, stride_batch_a, stride_batch_b, \
            stride_seq_c, stride_seq_a, stride_seq_b); \
        return INFINI_STATUS_SUCCESS;

    switch (dtype) {
        LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F16, swiglu_kernel_half)
        LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F32, swiglu_kernel_float)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_SWIGLU_KERNEL
}
#include "swiglu_kunlun.h"
// Op interface declare
LAUNCH_ELEMENTWISE_KERNEL(SwiGLU)
namespace op::swiglu::kunlun {
// Host-side launch adapter for the KUNLUN elementwise framework: records the
// operator's input arity and forwards all launch arguments to the generated
// kernel launcher.
struct SwiGLUOp {
    // SwiGLU consumes exactly two input tensors.
    static constexpr size_t num_inputs = 2;

    // Forwards every argument to the type-specialized kernel launcher.
    template <typename Tdata, typename... Args>
    static infiniStatus_t launch(Args... args) {
        launchSwiGLUKernel<Tdata>(args...);
        return INFINI_STATUS_SUCCESS;
    }
};
// Members release their own resources; no manual cleanup needed.
Descriptor::~Descriptor() = default;

// Builds a KUNLUN SwiGLU descriptor.
// input_desc_vec carries {up, gate}; the output dtype must be F32 and all
// three shapes identical. The CHECK_* / CREATE_* macros return the failure
// status early on violation.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::kunlun::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &up_desc = input_desc_vec.at(0);
    const auto &gate_desc = input_desc_vec.at(1);
    const auto &out_shape = out_desc->shape();
    const auto &up_shape = up_desc->shape();
    const auto &gate_shape = gate_desc->shape();

    // Only F32 is supported on KUNLUN (mirrored by calculate()'s switch).
    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
    CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);

    // create KUNLUN elementwise descriptor (writes *desc_ptr on success)
    CREATE_ELEMENTWISE_KUNLUN_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}
// Runs the SwiGLU elementwise kernel on KUNLUN.
// Fails with INFINI_STATUS_INSUFFICIENT_WORKSPACE when the caller provides
// less workspace than requested at create() time. Only F32 is dispatched,
// matching the CHECK_DTYPE contract enforced by create().
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F32:
        return _device_info->calculate<SwiGLUOp, float>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // NOTE: the previous trailing `return INFINI_STATUS_SUCCESS;` was
    // unreachable -- every switch path returns -- and has been removed.
}
} // namespace op::swiglu::kunlun
#ifndef __SWIGLU_KUNLUN_H__
#define __SWIGLU_KUNLUN_H__
#include "../../../elementwise/kunlun/elementwise_kunlun.h"
ELEMENTWISE_DESCRIPTOR(swiglu, kunlun)
#endif // __SWIGLU_KUNLUN_H__
// Kernel-side definitions for the KUNLUN SwiGLU operator.
// FIX: this header previously reused the include guard __SWIGLU_KUNLUN_H__,
// which the public API header (swiglu_kunlun.h) already defines; a TU that
// included both would silently drop this file's contents. It now uses its
// own distinct guard.
#ifndef __SWIGLU_KUNLUN_KERNEL_H__
#define __SWIGLU_KUNLUN_KERNEL_H__

#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "../../../elementwise/kunlun/elementwise_kunlun_kernel.h"

/// @brief Define swiglu op for local mem:
/// out = gate * sigmoid(gate) * up, with inputs packed as {up, gate}.
typedef struct SwiGLUOp {
private:
    // Logistic sigmoid 1/(1 + e^-x), evaluated with float constants.
    template <typename T>
    inline __device__ T sigmoid(T x) const {
        return 1.0f / (1.0f + exp(-x));
    }

public:
    // This static number must be set in other Ops: input arity consumed by
    // the elementwise framework.
    static constexpr size_t num_inputs = 2;

    // Applies SwiGLU to one element; inputs[0] = up, inputs[1] = gate.
    template <typename T>
    inline __device__ T operator()(const T *inputs) const {
        T up = inputs[0];
        T gate = inputs[1];
        T out = gate * sigmoid(gate) * up;
        return out;
    }
} SwiGLUOp;

// Definition for swiglu kernel interface
LAUNCH_ELEMENTWISE_KERNEL_IMPL(SwiGLU, SwiGLUOp)

// Template instantiate (float is the only dtype dispatched on KUNLUN)
LAUNCH_ELEMENTWISE_KERNEL_INSTANTIATE(SwiGLU, float)

#endif // __SWIGLU_KUNLUN_KERNEL_H__
#ifndef __SWIGLU_MACA_API_H__
#define __SWIGLU_MACA_API_H__

#include "../../../elementwise/maca/elementwise_maca_api.h"

// Declares the op::swiglu::maca::Descriptor API via the shared elementwise
// descriptor template (create / calculate / workspaceSize).
ELEMENTWISE_DESCRIPTOR(swiglu, maca)

#endif // __SWIGLU_MACA_API_H__
#include "swiglu_maca.h"
#include "swiglu_maca_internal.h"
namespace op::swiglu::maca {
// Members release their own resources; no manual cleanup needed.
Descriptor::~Descriptor() = default;

// Builds a METAX (MACA) SwiGLU descriptor.
// input_desc_vec carries {up, gate}; the output dtype must be F16, F32 or
// F64 and all three shapes identical. The CHECK_* / CREATE_* macros return
// the failure status early on violation.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::maca::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &up_desc = input_desc_vec.at(0);
    const auto &gate_desc = input_desc_vec.at(1);
    const auto &out_shape = out_desc->shape();
    const auto &up_shape = up_desc->shape();
    const auto &gate_shape = gate_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);

    // create MACA elementwise descriptor (writes *desc_ptr on success)
    CREATE_ELEMENTWISE_MACA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}
// Runs the SwiGLU elementwise kernel on METAX (MACA) with a 256-wide block.
// Fails with INFINI_STATUS_INSUFFICIENT_WORKSPACE when the caller provides
// less workspace than requested at create() time; dispatches by the dtype
// captured at create() (F16/F32/F64, matching create()'s CHECK_DTYPE).
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // NOTE: the previous trailing `return INFINI_STATUS_SUCCESS;` was
    // unreachable -- every switch path returns -- and has been removed.
}
} // namespace op::swiglu::maca
#ifndef __SWIGLU_MACA_H__
#define __SWIGLU_MACA_H__
#include "../../../elementwise/maca/elementwise_maca.h"
#include <hctlass/half.h>
namespace op::swiglu::maca {
// Device-side SwiGLU functor for the MACA elementwise framework:
// out = gate * sigmoid(gate) * up, with per-type intrinsic specializations.
typedef struct SwiGLUOp {
private:
    // Logistic sigmoid 1/(1 + e^-x) using the fastest intrinsic path for
    // each type; the scalar-half path evaluates exp in float for accuracy.
    template <typename T>
    __device__ __forceinline__ T sigmoid(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed half2: lane-wise reciprocal of (1 + exp(-x)).
            return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
        } else if constexpr (std::is_same_v<T, half>) {
            // Scalar half: exp computed in float, then converted back.
            return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
        } else if constexpr (std::is_same_v<T, float>) {
            return __frcp_rn(__fadd_rn(1, __expf(-x)));
        } else {
            // Generic fallback (e.g. double).
            return 1 / (1 + std::exp(-x));
        }
    }

public:
    // Input arity consumed by the elementwise framework.
    static constexpr size_t num_inputs = 2;

    // Applies SwiGLU to one element (or one half2 lane pair).
    template <typename T>
    __device__ __forceinline__ T operator()(const T &up, const T &gate) const {
        if constexpr (std::is_same_v<T, half2>) {
            return __hmul2(__hmul2(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmul(__hmul(gate, sigmoid(gate)), up);
        } else if constexpr (std::is_same_v<T, float>) {
            return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
        } else {
            return gate * sigmoid(gate) * up;
        }
    }
} SwiGLUOp;
} // namespace op::swiglu::maca
#endif
...@@ -8,6 +8,15 @@ ...@@ -8,6 +8,15 @@
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
#include "cuda/swiglu_cuda.cuh" #include "cuda/swiglu_cuda.cuh"
#endif #endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/swiglu_kunlun.h"
#endif
#ifdef ENABLE_METAX_API
#include "maca/swiglu_maca.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/swiglu_ascend.h"
#endif
__C infiniStatus_t infiniopCreateSwiGLUDescriptor( __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
infiniopHandle_t handle, infiniopHandle_t handle,
...@@ -33,6 +42,12 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor( ...@@ -33,6 +42,12 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda); CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif #endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangCreateSwiGLUDescriptor((BangHandle_t)handle, return bangCreateSwiGLUDescriptor((BangHandle_t)handle,
...@@ -40,11 +55,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor( ...@@ -40,11 +55,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
c_desc, a_desc, b_desc); c_desc, a_desc, b_desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_API
case DevAscendNpu: CREATE(INFINI_DEVICE_ASCEND, ascend);
return ascendCreateSwiGLUDescriptor(
(AscendHandle_t)handle, (SwiGLUAscendDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc);
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: { case DevMetaxGpu: {
...@@ -80,12 +92,18 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des ...@@ -80,12 +92,18 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
GET(INFINI_DEVICE_NVIDIA, cuda) GET(INFINI_DEVICE_NVIDIA, cuda)
#endif #endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun)
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size); return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend) GET(INFINI_DEVICE_ASCEND, ascend)
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
...@@ -127,14 +145,19 @@ __C infiniStatus_t infiniopSwiGLU( ...@@ -127,14 +145,19 @@ __C infiniStatus_t infiniopSwiGLU(
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda); CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif #endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangSwiGLU((SwiGLUBangDescriptor_t)desc, c, a, b, stream); return bangSwiGLU((SwiGLUBangDescriptor_t)desc, c, a, b, stream);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_API
case DevAscendNpu: CALCULATE(INFINI_DEVICE_ASCEND, ascend);
return ascendSwiGLU((SwiGLUAscendDescriptor_t)desc, c, a, b, stream);
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: case DevMetaxGpu:
...@@ -168,14 +191,19 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) { ...@@ -168,14 +191,19 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda); DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif #endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, maca);
#endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t)desc); return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_NPU #ifdef ENABLE_ASCEND_API
case DevAscendNpu: DELETE(INFINI_DEVICE_ASCEND, ascend)
return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t)desc);
#endif #endif
#ifdef ENABLE_METAX_GPU #ifdef ENABLE_METAX_GPU
case DevMetaxGpu: case DevMetaxGpu:
......
...@@ -18,7 +18,7 @@ __device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t cou ...@@ -18,7 +18,7 @@ __device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t cou
// Each thread computes its partial sum // Each thread computes its partial sum
for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
ss += Tcompute(data_ptr[i] * data_ptr[i]); ss += Tcompute(data_ptr[i]) * Tcompute(data_ptr[i]);
} }
// Use CUB block-level reduction // Use CUB block-level reduction
......
#ifndef __INFINIOP_REDUCE_KUNLUN_H__ #ifndef __INFINIOP_REDUCE_KUNLUN_H__
#define __INFINIOP_REDUCE_KUNLUN_H__ #define __INFINIOP_REDUCE_KUNLUN_H__
#include "../../devices/kunlun/kunlun_common.h" #include "../../devices/kunlun/kunlun_kernel_common.h"
namespace op::common_kunlun::reduce_op { namespace op::common_kunlun::reduce_op {
using namespace device::kunlun::kernel;
// Use 16 floats instruction to calculate reduce // Use 16 floats instruction to calculate reduce
// data_ptr is the pointer of LM // data_ptr is the pointer of LM
static inline __device__ float sumSquaredF32(float *data_ptr, int count) { static inline __device__ float sumSquaredF32(float *data_ptr, int count) {
......
...@@ -2,9 +2,19 @@ ...@@ -2,9 +2,19 @@
#define __INFINIOP_TENSOR_H__ #define __INFINIOP_TENSOR_H__
#include "infiniop/tensor_descriptor.h" #include "infiniop/tensor_descriptor.h"
#include "../utils.h"
#include <string> #include <string>
#include <vector> #include <vector>
#define TRANSFORM_TENSOR_DESC(__TENSOR_DESC__, __OP__) \
do { \
auto __RESULT__ = __TENSOR_DESC__->__OP__; \
CHECK_RESULT(__RESULT__); \
__TENSOR_DESC__ = __RESULT__.take(); \
} while (0)
struct InfiniopTensorDescriptor { struct InfiniopTensorDescriptor {
private: private:
// Datatype // Datatype
...@@ -32,9 +42,9 @@ public: ...@@ -32,9 +42,9 @@ public:
bool hasBroadcastDim() const; bool hasBroadcastDim() const;
std::vector<size_t> getBroadcastDim() const; std::vector<size_t> getBroadcastDim() const;
infiniopTensorDescriptor_t dimMerge(size_t dim_start, size_t dim_end) const; utils::Result<infiniopTensorDescriptor_t> dimMerge(size_t dim_start, size_t dim_end) const;
infiniopTensorDescriptor_t dimSplit(size_t axis, const std::vector<size_t> &dims) const; utils::Result<infiniopTensorDescriptor_t> dimSplit(size_t axis, const std::vector<size_t> &dims) const;
infiniopTensorDescriptor_t dimPermute(const std::vector<size_t> &order) const; utils::Result<infiniopTensorDescriptor_t> dimPermute(const std::vector<size_t> &order) const;
std::string toString() const; std::string toString() const;
}; };
......
...@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip ...@@ -12,7 +12,7 @@ __C __export infiniStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescrip
std::vector<ptrdiff_t> strides(ndim); std::vector<ptrdiff_t> strides(ndim);
ptrdiff_t dsize = 1; ptrdiff_t dsize = 1;
if (ndim > 0) { if (ndim > 0) {
for (size_t i = ndim - 1; i >= 0; i--) { for (int i = (int)ndim - 1; i >= 0; i--) {
strides[i] = dsize; strides[i] = dsize;
dsize *= shape_[i]; dsize *= shape_[i];
} }
...@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const { ...@@ -104,10 +104,8 @@ std::vector<size_t> InfiniopTensorDescriptor::getBroadcastDim() const {
return res; return res;
} }
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const { utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimMerge(size_t dim_start, size_t dim_end) const {
if (dim_start > dim_end || dim_end >= ndim()) { CHECK_OR_RETURN(dim_start <= dim_end && dim_end < ndim(), INFINI_STATUS_BAD_PARAM);
return nullptr;
}
size_t new_ndim = ndim() - (dim_end - dim_start); size_t new_ndim = ndim() - (dim_end - dim_start);
std::vector<size_t> new_shape(new_ndim); std::vector<size_t> new_shape(new_ndim);
...@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, ...@@ -120,9 +118,7 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++; index++;
} }
if (!isContiguous(dim_start, dim_end)) { CHECK_OR_RETURN(isContiguous(dim_start, dim_end), INFINI_STATUS_BAD_PARAM);
return nullptr;
}
new_shape[index] = 1; new_shape[index] = 1;
for (size_t i = dim_start; i <= dim_end; i++) { for (size_t i = dim_start; i <= dim_end; i++) {
...@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start, ...@@ -138,15 +134,15 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimMerge(size_t dim_start,
index++; index++;
} }
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()); return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
} }
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const { utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimSplit(size_t axis, const std::vector<size_t> &dims) const {
size_t ndim_ = ndim(); size_t ndim_ = ndim();
if (dim(axis) != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>())) { CHECK_OR_RETURN(dim(axis) == std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies<size_t>()),
return nullptr; INFINI_STATUS_BAD_PARAM);
}
size_t new_ndim = ndim_ + dims.size() - 1; size_t new_ndim = ndim_ + dims.size() - 1;
std::vector<size_t> new_shape(new_ndim); std::vector<size_t> new_shape(new_ndim);
...@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const ...@@ -168,24 +164,22 @@ infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimSplit(size_t axis, const
index++; index++;
} }
return new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()); return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, new_ndim, new_shape.data(), new_strides.data()));
} }
infiniopTensorDescriptor_t InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const { utils::Result<infiniopTensorDescriptor_t> InfiniopTensorDescriptor::dimPermute(const std::vector<size_t> &order) const {
auto ndim_ = ndim(); auto ndim_ = ndim();
if (order.size() != ndim_) { CHECK_OR_RETURN(order.size() == ndim_, INFINI_STATUS_BAD_PARAM);
return nullptr;
}
std::vector<size_t> new_shape(ndim_); std::vector<size_t> new_shape(ndim_);
std::vector<ptrdiff_t> new_strides(ndim_); std::vector<ptrdiff_t> new_strides(ndim_);
for (size_t i = 0; i < ndim_; i++) { for (size_t i = 0; i < ndim_; i++) {
if (std::find(order.begin(), order.end(), i) == order.end()) { CHECK_OR_RETURN(std::find(order.begin(), order.end(), i) != order.end(), INFINI_STATUS_BAD_PARAM);
return nullptr;
}
new_shape[i] = dim(order[i]); new_shape[i] = dim(order[i]);
new_strides[i] = stride(order[i]); new_strides[i] = stride(order[i]);
} }
return new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data()); return utils::Result<infiniopTensorDescriptor_t>(
new InfiniopTensorDescriptor(_dtype, ndim_, new_shape.data(), new_strides.data()));
} }
std::string InfiniopTensorDescriptor::toString() const { std::string InfiniopTensorDescriptor::toString() const {
......
...@@ -6,7 +6,8 @@ ...@@ -6,7 +6,8 @@
namespace infinirt::bang { namespace infinirt::bang {
infiniStatus_t getDeviceCount(int *count) { infiniStatus_t getDeviceCount(int *count) {
CHECK_BANGRT(cnrtGetDeviceCount(count)); unsigned int device_count = static_cast<unsigned int>(*count);
CHECK_BANGRT(cnrtGetDeviceCount(&device_count));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() { ...@@ -22,7 +23,7 @@ infiniStatus_t deviceSynchronize() {
infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) { infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {
cnrtQueue_t queue; cnrtQueue_t queue;
CHECK_BANGRT(cnrtQueueCreate(&stream)); CHECK_BANGRT(cnrtQueueCreate(&queue));
*stream_ptr = queue; *stream_ptr = queue;
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) { ...@@ -55,7 +56,7 @@ infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
} }
infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) { infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) {
auto status = cnrtQueryNotifier((cnrtQueue_t)stream); auto status = cnrtQueryNotifier((cnrtNotifier_t)event);
if (status == cnrtSuccess) { if (status == cnrtSuccess) {
*status_ptr = INFINIRT_EVENT_COMPLETE; *status_ptr = INFINIRT_EVENT_COMPLETE;
} else if (status == cnrtErrorBusy) { } else if (status == cnrtErrorBusy) {
...@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) { ...@@ -112,12 +113,12 @@ cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
} }
infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) { infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) {
CHECK_BANGRT(cnrtMemcpy(dst, src, size, toBangMemcpyKind(kind))); CHECK_BANGRT(cnrtMemcpy(dst, (void *)src, size, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) { infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind))); CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, (void *)src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../infinirt_impl.h" #include "../infinirt_impl.h"
namespace infinirt::bang { namespace infinirt::bang {
#ifdef ENABLE_BANG_API #ifdef ENABLE_CAMBRICON_API
INFINIRT_DEVICE_API_IMPL INFINIRT_DEVICE_API_IMPL
#else #else
INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API_NOOP
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "bang/infinirt_bang.h" #include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h" #include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh" #include "cuda/infinirt_cuda.cuh"
#include "kunlun/infinirt_kunlun.h"
#include "maca/infinirt_maca.h" #include "maca/infinirt_maca.h"
#include "musa/infinirt_musa.h" #include "musa/infinirt_musa.h"
...@@ -66,8 +67,11 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_ ...@@ -66,8 +67,11 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_MOORE: \ case INFINI_DEVICE_MOORE: \
_status = infinirt::musa::API PARAMS; \ _status = infinirt::musa::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_KUNLUN: \
_status = infinirt::kunlun::API PARAMS; \
break; \
default: \ default: \
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ _status = INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \
} \ } \
{ ACTION; } \ { ACTION; } \
return _status; \ return _status; \
......
...@@ -100,4 +100,12 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) { ...@@ -100,4 +100,12 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) {
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) #define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
namespace utils {
inline size_t align(size_t size, size_t alignment) {
return (size + alignment - 1) & ~(alignment - 1);
}
} // namespace utils
#endif #endif
...@@ -43,7 +43,7 @@ fp16_t _f32_to_f16(float val) { ...@@ -43,7 +43,7 @@ fp16_t _f32_to_f16(float val) {
int32_t exponent = ((f32 >> 23) & 0xFF) - 127; // Extract and de-bias the exponent int32_t exponent = ((f32 >> 23) & 0xFF) - 127; // Extract and de-bias the exponent
uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part) uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part)
if (exponent >= 31) { // Special cases for Inf and NaN if (exponent >= 16) { // Special cases for Inf and NaN
// NaN // NaN
if (exponent == 128 && mantissa != 0) { if (exponent == 128 && mantissa != 0) {
return fp16_t{static_cast<uint16_t>(sign | 0x7E00)}; return fp16_t{static_cast<uint16_t>(sign | 0x7E00)};
......
from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
...@@ -29,6 +29,14 @@ def gguf_strides(*args: int) -> list[int] | None: ...@@ -29,6 +29,14 @@ def gguf_strides(*args: int) -> list[int] | None:
return list(args)[::-1] if args else None return list(args)[::-1] if args else None
def contiguous_gguf_strides(shape: tuple[int, ...]) -> list[int]:
strides = []
acc = 1
for size in reversed(shape):
strides.append(acc)
acc *= size
return strides[::-1]
class InfiniopTestCase: class InfiniopTestCase:
op_name: str op_name: str
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment