Commit b2b4526b authored by wooway777

issue/216 - Merge remote-tracking branch main into issue/216

parents 3c696932 0c803397
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16.0)
 # project information
 project(Ascend_C)
 set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
-set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME} CACHE PATH "ASCEND CANN package installation directory")
+set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
 set(RUN_MODE "npu" CACHE STRING "run mode: npu")
 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
 set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
@@ -19,10 +19,14 @@ else()
 endif()
 include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+include_directories(
+    ${CMAKE_SOURCE_DIR}/../../../../include/infiniop/
+)
 ascendc_library(ascend_kernels STATIC
-    ../../ops/swiglu/ascend/swiglu_kernel.cpp
-    ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
-    ../../ops/random_sample/ascend/random_sample_kernel.cpp
+    ../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
+    # ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
+    # ../../ops/random_sample/ascend/random_sample_kernel.cpp
 )
#ifndef __INFINIOP_ASCEND_KERNEL_COMMON_H__
#define __INFINIOP_ASCEND_KERNEL_COMMON_H__
#include "../../../../include/infinicore.h"
#include "kernel_operator.h"
constexpr int32_t BLOCK_NUM = 8;
constexpr int32_t BUFFER_NUM = 2;
constexpr int32_t BYTE_ALIGN = 32;
#endif
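Editor's note: a minimal host-side sketch (not part of the commit; align_elems is a hypothetical helper) of how BYTE_ALIGN is intended to be used — DataCopy moves whole 32-byte blocks, so a tile's element count is rounded up until its byte size is a multiple of BYTE_ALIGN, exactly as the SwiGLU kernel later in this commit does:

#include <cstdint>

// Round an element count up so the tile's byte size is a multiple of byte_align.
// Assumes byte_align is a multiple of elem_size (true for 2- and 4-byte dtypes vs. 32).
constexpr int64_t align_elems(int64_t elems, int64_t elem_size, int64_t byte_align = 32) {
    return (elems * elem_size + byte_align - 1) / byte_align * byte_align / elem_size;
}
static_assert(align_elems(13, 2) == 16, "26 bytes rounds up to 32 bytes, i.e. 16 halves");
static_assert(align_elems(16, 2) == 16, "already 32-byte aligned");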
#include "common_ascend.h" #include "common_ascend.h"
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) { std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
auto index = std::max_element(strides.begin(), strides.end()); if (shape.size() != strides.size()) {
uint64_t max_stride_index = std::distance(strides.begin(), index); throw std::invalid_argument("Shape and strides must have the same length.");
auto storageShape = std::vector<int64_t>({shape[max_stride_index] * strides[max_stride_index]}); }
int64_t max_offset = 0;
for (size_t i = 0; i < shape.size(); ++i) {
max_offset += (shape[i] - 1) * strides[i];
}
return storageShape; // storage shape is 1D buffer that must cover all accessed elements
return {max_offset + 1};
} }
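Editor's note: a quick worked check (illustrative, not from the commit) of why the new sum-of-offsets formula replaces the old max-stride heuristic — for some permuted layouts the old result undershot the buffer:

#include <cassert>
#include <cstdint>
#include <vector>

// Same formula as inferStorageShape above: largest reachable offset, plus one.
int64_t infer_storage_size(const std::vector<int64_t> &shape,
                           const std::vector<int64_t> &strides) {
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return max_offset + 1;
}

int main() {
    // Contiguous 2x3 view: both formulas agree on 6 elements.
    assert(infer_storage_size({2, 3}, {3, 1}) == 6);
    // shape {3,2}, strides {2,3}: the last element sits at offset 2*2 + 1*3 = 7,
    // so 8 elements are required; the old heuristic returned shape[1] * strides[1] = 6.
    assert(infer_storage_size({3, 2}, {2, 3}) == 8);
    return 0;
}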
 size_t aclnnTensorDescriptor::numel() const {
@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
     this->strides = std::vector<int64_t>(ndim);
     for (uint64_t i = 0; i < ndim; ++i) {
         this->shape[i] = static_cast<int64_t>(desc->dim(i));
-        this->strides[i] = desc->stride(i);
+        this->strides[i] = static_cast<int64_t>(desc->stride(i));
     }
     this->storageShape = inferStorageShape(this->shape, this->strides);
     this->dataType = toAclDataType(desc->dtype());
...
#include "causal_softmax_aclnn.h" #include "causal_softmax_ascend.h"
#include "../../../devices/ascend/common_ascend.h" #include "../../../devices/ascend/common_ascend.h"
#include <aclnnop/aclnn_masked_fill_tensor.h> #include <aclnnop/aclnn_masked_fill_tensor.h>
#include <aclnnop/aclnn_softmax.h> #include <aclnnop/aclnn_softmax.h>
...@@ -12,6 +12,8 @@ struct Descriptor::Opaque { ...@@ -12,6 +12,8 @@ struct Descriptor::Opaque {
aclnnTensorDescriptor_t value; aclnnTensorDescriptor_t value;
void *mask_addr; void *mask_addr;
void *value_addr; void *value_addr;
uint64_t workspacesize;
aclOpExecutor *executor;
~Opaque() { ~Opaque() {
delete x; delete x;
...@@ -21,6 +23,9 @@ struct Descriptor::Opaque { ...@@ -21,6 +23,9 @@ struct Descriptor::Opaque {
aclrtFree(mask_addr); aclrtFree(mask_addr);
aclrtFree(value_addr); aclrtFree(value_addr);
// Delete useless executor
aclDestroyAclOpExecutor(executor);
} }
}; };
@@ -92,18 +97,18 @@ infiniStatus_t Descriptor::create(
     aclTensor *tvalue = value->tensor;
     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
     int64_t dim = 2;
     CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
+    // Mark the softmax executor repeatable so it can be cached and replayed
+    aclSetAclOpExecutorRepeatable(executor);
     // Create the descriptor
-    size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
-    *desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr},
-                               std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
-    // Delete useless executor
-    aclDestroyAclOpExecutor(executor);
-    aclDestroyAclOpExecutor(mask_executor);
+    size_t all_workspacesize = std::max(workspacesize_softmax, workspacesize_mask);
+    *desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr,
+                                          workspacesize_softmax, executor},
+                               std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
     return INFINI_STATUS_SUCCESS;
 }
@@ -116,23 +121,18 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
     auto ty = _opaque->y->tensor;
     auto tmask = _opaque->mask->tensor;
     auto tvalue = _opaque->value->tensor;
-    aclOpExecutor *executor = nullptr;
     aclOpExecutor *mask_executor = nullptr;
-    size_t workspacesize_softmax = 0;
     size_t workspacesize_mask = 0;
-    int64_t dim = 2;
     AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
     AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
     AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
     CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
-    AclSetTensorAddr(executor, 0, tx, (void *)x);
-    AclSetTensorAddr(executor, 1, ty, y);
-    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
-    CHECK_ACL(aclnnSoftmax(workspace, workspacesize_softmax, executor, stream));
+    CHECK_ACL(aclrtSynchronizeStream(stream));
+    AclSetTensorAddr(_opaque->executor, 0, tx, (void *)x);
+    AclSetTensorAddr(_opaque->executor, 1, ty, y);
+    CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize, _opaque->executor, stream));
     return INFINI_STATUS_SUCCESS;
 }
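Editor's note: the executor-caching pattern this file now relies on, condensed into a sketch (CHECK_ACL and AclSetTensorAddr are this repo's helpers; x_device/y_device are illustrative names). Marking the executor repeatable opts it out of the usual destroy-on-launch behavior, and the stream synchronization above is what lets masked-fill and softmax share a single workspace sized with std::max instead of a sum:

// create(): plan once, mark the plan reusable, cache executor + workspace size.
aclOpExecutor *executor = nullptr;
uint64_t workspacesize = 0;
CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize, &executor));
CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));

// calculate(): rebind device pointers, then replay the cached plan.
AclSetTensorAddr(executor, 0, tx, x_device);
AclSetTensorAddr(executor, 1, ty, y_device);
CHECK_ACL(aclnnSoftmax(workspace, workspacesize, executor, stream));

// ~Opaque(): a repeatable executor must be destroyed explicitly.
aclDestroyAclOpExecutor(executor);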
...
@@ -9,7 +9,7 @@
 #include "cuda/causal_softmax_cuda.cuh"
 #endif
 #ifdef ENABLE_ASCEND_API
-#include "ascend/causal_softmax_aclnn.h"
+#include "ascend/causal_softmax_ascend.h"
 #endif
 __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
...
#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
#include <aclnnop/aclnn_copy.h>
namespace op::rearrange::ascend {
struct Descriptor::Opaque {
aclnnTensorDescriptor_t dst;
aclnnTensorDescriptor_t src;
void *workspace; // aclnnInplaceCopy workspace
uint64_t workspace_size;
~Opaque() {
delete dst;
delete src;
aclrtFree(workspace);
}
};
Descriptor::~Descriptor() {
delete _opaque;
};
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
auto dtype = y_desc->dtype();
auto ndim = y_desc->ndim();
auto shape = y_desc->shape();
CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < ndim; ++i) {
CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
}
auto dst_strides = y_desc->strides();
auto src_strides = x_desc->strides();
auto element_size = infiniSizeOf(dtype);
auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
CHECK_RESULT(result);
aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
void *workspace = nullptr;
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
&workspace_size, &executor));
if (workspace_size != 0) {
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
}
*desc_ptr = new Descriptor(
result.take(),
new Opaque{
dst,
src,
workspace,
workspace_size},
handle->device,
handle->device_id);
// The one-shot executor from the workspace-size query is not reused; destroy it
aclDestroyAclOpExecutor(executor);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *y,
const void *x,
void *stream) const {
auto tdst = _opaque->dst->tensor;
auto tsrc = _opaque->src->tensor;
uint64_t workspace_size = 0;
aclOpExecutor *executor = nullptr;
AclSetTensorAddr(executor, 0, tdst, y);
AclSetTensorAddr(executor, 1, tsrc, (void *)x);
CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
// Execute InplaceCopy
CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
executor, stream));
return INFINI_STATUS_SUCCESS;
}
} // namespace op::rearrange::ascend
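Editor's note: semantically, rearrange is a pure strided copy — the aclnnInplaceCopy above performs on device what this host-side reference (illustrative only, rank-2 for brevity) does element by element:

#include <cstdint>
#include <cstring>
#include <vector>

// Copy a strided rank-2 view src into a strided rank-2 view dst of the same
// shape; strides are in elements, as in the infiniop tensor descriptors.
void rearrange_ref(char *dst, const char *src, const std::vector<int64_t> &shape,
                   const std::vector<int64_t> &dst_strides,
                   const std::vector<int64_t> &src_strides, size_t elem_size) {
    for (int64_t i = 0; i < shape[0]; ++i) {
        for (int64_t j = 0; j < shape[1]; ++j) {
            std::memcpy(dst + (i * dst_strides[0] + j * dst_strides[1]) * elem_size,
                        src + (i * src_strides[0] + j * src_strides[1]) * elem_size,
                        elem_size);
        }
    }
}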
#ifndef __REARRANGE_ASCEND_H__
#define __REARRANGE_ASCEND_H__
#include "../rearrange.h"
DESCRIPTOR(ascend)
#endif // __REARRANGE_ASCEND_H__
@@ -5,6 +5,9 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/rearrange_cpu.h"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/rearrange_ascend.h"
+#endif
 #ifdef ENABLE_CUDA_API
 #include "cuda/rearrange_cuda.cuh"
@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 #ifdef ENABLE_CUDA_API
     CREATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
 #ifdef ENABLE_CPU_API
     CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 #ifdef ENABLE_CUDA_API
     CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 #ifdef ENABLE_CUDA_API
     DELETE(INFINI_DEVICE_NVIDIA, cuda);
...
@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
     aclnnTensorDescriptor_t w;
     aclnnTensorDescriptor_t rstd;
     size_t workspaceSize;
+    aclOpExecutor *executor;
     ~Opaque() {
         delete y;
         delete x;
         delete w;
         delete rstd;
+        aclDestroyAclOpExecutor(executor);
     }
 };
@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
     // Get WorkspaceSize and set executor
     CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
+    aclSetAclOpExecutorRepeatable(executor);
     auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
     size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
     *desc_ptr = new Descriptor(
-        new Opaque{y, x, w, rstd, workspace_size},
+        new Opaque{y, x, w, rstd, workspace_size, executor},
         std::move(info),
         all_workspace_size,
         handle_ascend->device, handle_ascend->device_id);
-    aclDestroyAclOpExecutor(executor);
     return INFINI_STATUS_SUCCESS;
 }
@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
     auto tx = _opaque->x->tensor;
     auto ty = _opaque->y->tensor;
     auto trstd = _opaque->rstd->tensor;
-    size_t workspace_size_ = 0;
-    aclOpExecutor *executor = nullptr;
-    CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(_info.epsilon), ty, trstd, &workspace_size_, &executor));
-    CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
     void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
     auto unit = infiniSizeOf(_info.atype);
-    AclSetTensorAddr(executor, 1, tw, (void *)w);
-    AclSetTensorAddr(executor, 3, trstd, rstdPtr);
+    AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
+    AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
     for (size_t i = 0; i < (_info.shape)[0]; ++i) {
-        AclSetTensorAddr(executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
-        AclSetTensorAddr(executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
-        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, executor, stream));
+        AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
+        AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
+        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
     }
     return INFINI_STATUS_SUCCESS;
 }
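Editor's note: two bits of pointer arithmetic above deserve spelling out; a sketch with illustrative helper names (not repo API). create() over-allocates the workspace so the rstd output, which the infiniop interface never exposes, has somewhere to live, and calculate() re-points the cached executor at one row of the strided views per iteration:

#include <cstddef>
#include <cstdint>

// Workspace layout sized in create():
//   [0, rmsnorm_ws)                    scratch used by aclnnRmsNorm itself
//   [rmsnorm_ws, rmsnorm_ws + rstd)    hidden rstd output tensor
void *rstd_slot(void *workspace, size_t rmsnorm_ws) {
    return static_cast<uint8_t *>(workspace) + rmsnorm_ws;
}

// Byte address of row i in a strided 2-D view, as computed for x and y in the loop.
void *row_ptr(void *base, size_t i, int64_t stride0, size_t elem_size) {
    return static_cast<uint8_t *>(base) + i * stride0 * elem_size;
}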
...
#include "swiglu_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
namespace op::swiglu::ascend {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(infiniopHandle_t handle, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
std::vector<infiniopTensorDescriptor_t> input_descs) {
auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
auto dtype = c_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
const auto &a_desc = input_descs[0];
const auto &b_desc = input_descs[1];
auto result = SwigluInfo::create(c_desc, a_desc, b_desc);
CHECK_RESULT(result);
SwigluInfo info = result.take();
// https://www.hiascend.com/document/detail/zh/canncommercial/800/apiref/ascendcopapi/atlasascendc_api_07_0777.html
size_t workspace_size = 0;
*desc_ptr = new Descriptor(std::move(info), workspace_size, handle_ascend->device, handle_ascend->device_id);
return INFINI_STATUS_SUCCESS;
}
extern "C" infiniStatus_t swiglu_kernel_launch(
void *c, void *a, void *b,
infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b, void *stream);
infiniStatus_t Descriptor::calculate(void *workspace,
size_t workspace_size,
void *c,
std::vector<const void *> inputs,
void *stream) const {
auto batch = _info.ndim == 2 ? 1 : _info.shape[0];
auto seq_len = _info.ndim == 2 ? _info.shape[0] : _info.shape[1];
auto hidden_size = _info.shape[_info.ndim - 1];
auto stride_batch_c = _info.ndim == 2 ? 1 : _info.c_strides[0];
auto stride_batch_a = _info.ndim == 2 ? 1 : _info.a_strides[0];
auto stride_batch_b = _info.ndim == 2 ? 1 : _info.b_strides[0];
auto stride_seq_c = _info.ndim == 2 ? _info.c_strides[0] : _info.c_strides[1];
auto stride_seq_a = _info.ndim == 2 ? _info.a_strides[0] : _info.a_strides[1];
auto stride_seq_b = _info.ndim == 2 ? _info.b_strides[0] : _info.b_strides[1];
auto status = swiglu_kernel_launch(c, (void *)inputs[0], (void *)inputs[1], _info.dtype, batch, seq_len, hidden_size, stride_batch_c, stride_batch_a, stride_batch_b, stride_seq_c, stride_seq_a, stride_seq_b, stream);
return status;
}
} // namespace op::swiglu::ascend
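Editor's note: a scalar reference (illustrative; it assumes the first operand is the gated/activated one, matching the order in which the kernel later in this commit hands aLocal and bLocal to AscendC's SwiGLU with beta = 1) for the element-wise math this op computes:

#include <cmath>

// swish_beta(x) = x / (1 + exp(-beta * x)); SwiGLU multiplies the swish of one
// input by the other: out = swish_beta(a) * b.
float swiglu_ref(float a, float b, float beta = 1.0f) {
    return (a / (1.0f + std::exp(-beta * a))) * b;
}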
#ifndef __ACLNN_SWIGLU_H__
#define __ACLNN_SWIGLU_H__
#include "../../../../utils.h"
#include "../../../../utils/check.h"
#include "../../../operator.h"
#include "../../../tensor.h"
namespace op::swiglu::ascend {
class SwigluInfo {
SwigluInfo() = default;
public:
infiniDtype_t dtype;
std::vector<size_t> shape;
int32_t ndim;
std::vector<ptrdiff_t> c_strides;
std::vector<ptrdiff_t> a_strides;
std::vector<ptrdiff_t> b_strides;
static utils::Result<SwigluInfo> create(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
CHECK_OR_RETURN(c_desc && a_desc && b_desc, INFINI_STATUS_BAD_PARAM);
CHECK_OR_RETURN(!c_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
CHECK_OR_RETURN(c_desc->ndim() == a_desc->ndim()
&& c_desc->ndim() == b_desc->ndim()
&& (c_desc->ndim() == 2 || c_desc->ndim() == 3),
INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_SAME_SHAPE(c_desc->shape(), a_desc->shape(), b_desc->shape());
int32_t ndim = c_desc->ndim();
CHECK_OR_RETURN(c_desc->stride(ndim - 1) == 1
&& a_desc->stride(ndim - 1) == 1
&& b_desc->stride(ndim - 1) == 1,
INFINI_STATUS_BAD_TENSOR_STRIDES);
CHECK_OR_RETURN(c_desc->dtype() == a_desc->dtype()
&& c_desc->dtype() == b_desc->dtype(),
INFINI_STATUS_BAD_TENSOR_DTYPE);
return utils::Result<SwigluInfo>(SwigluInfo{
c_desc->dtype(),
c_desc->shape(),
ndim,
c_desc->strides(),
a_desc->strides(),
b_desc->strides(),
});
}
};
class Descriptor final : public InfiniopDescriptor {
SwigluInfo _info;
size_t _workspace_size;
Descriptor(SwigluInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id) : InfiniopDescriptor{device_type, device_id},
_info(info), _workspace_size(workspace_size) {}
public:
~Descriptor();
static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
std::vector<infiniopTensorDescriptor_t> input_descs);
size_t workspaceSize() const { return _workspace_size; }
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *c,
std::vector<const void *> inputs,
void *stream) const;
};
} // namespace op::swiglu::ascend
#endif // __ACLNN_SWIGLU_H__
#include "../../../devices/ascend/ascend_kernel_common.h"
using namespace AscendC;
template <typename T>
class SwigluKernel {
public:
__aicore__ inline SwigluKernel() {}
__aicore__ inline void init(GM_ADDR c, GM_ADDR a, GM_ADDR b, int64_t batch_, int64_t seq, int64_t hd,
int64_t stride_batch_c, int64_t stride_batch_a, int64_t stride_batch_b,
int64_t stride_seq_c, int64_t stride_seq_a, int64_t stride_seq_b);
__aicore__ inline void process();
private:
__aicore__ inline void copyIn(int64_t i);
__aicore__ inline void compute(int64_t i);
__aicore__ inline void copyOut(int64_t i);
private:
GlobalTensor<T> _c_gm, _a_gm, _b_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> _in_queue_a, _in_queue_b;
TQue<QuePosition::VECOUT, BUFFER_NUM> _out_queue_c;
TPipe _pipe;
float _beta_value = 1.0f;
int64_t _block_idx, _tile_len, _copy_len, _tile_offset,
_batch, _seq_len, _hidden_size,
_stride_seq_a, _stride_seq_b, _stride_seq_c;
int64_t _stride_batch_a = 1, _stride_batch_b = 1, _stride_batch_c = 1;
};
template <typename T>
__aicore__ inline void SwigluKernel<T>::init(GM_ADDR c, GM_ADDR a, GM_ADDR b, int64_t batch_, int64_t seq, int64_t hd,
int64_t stride_batch_c, int64_t stride_batch_a, int64_t stride_batch_b,
int64_t stride_seq_c, int64_t stride_seq_a, int64_t stride_seq_b) {
// Initialize shape and stride variables
_batch = batch_;
_seq_len = seq;
_hidden_size = hd;
_stride_batch_a = stride_batch_a;
_stride_batch_b = stride_batch_b;
_stride_batch_c = stride_batch_c;
_stride_seq_a = stride_seq_a;
_stride_seq_b = stride_seq_b;
_stride_seq_c = stride_seq_c;
_block_idx = GetBlockIdx();
_tile_len = _block_idx < (_hidden_size % BLOCK_NUM) ? (_hidden_size / BLOCK_NUM) + 1 : (_hidden_size / BLOCK_NUM);
// The first (_hidden_size % BLOCK_NUM) blocks carry one extra element, so a block's
// start offset is not simply _block_idx * _tile_len when the split is uneven.
_tile_offset = _block_idx * (_hidden_size / BLOCK_NUM) + (_block_idx < (_hidden_size % BLOCK_NUM) ? _block_idx : (_hidden_size % BLOCK_NUM));
// Round the tile's byte size up to BYTE_ALIGN so DataCopy moves whole aligned blocks
_copy_len = (_tile_len * sizeof(T)) % BYTE_ALIGN == 0 ? _tile_len : (_tile_len * sizeof(T) + (BYTE_ALIGN - _tile_len * sizeof(T) % BYTE_ALIGN)) / sizeof(T);
// Set global tensor
_a_gm.SetGlobalBuffer((__gm__ T *)a);
_b_gm.SetGlobalBuffer((__gm__ T *)b);
_c_gm.SetGlobalBuffer((__gm__ T *)c);
// _pipe alloc memory to queue, the unit is bytes
_pipe.InitBuffer(_in_queue_a, BUFFER_NUM, _copy_len * sizeof(T));
_pipe.InitBuffer(_in_queue_b, BUFFER_NUM, _copy_len * sizeof(T));
_pipe.InitBuffer(_out_queue_c, BUFFER_NUM, _copy_len * sizeof(T));
}
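// Editor's note (illustrative numbers): with _hidden_size = 100 halves and
// BLOCK_NUM = 8, 100 % 8 = 4 and 100 / 8 = 12, so blocks 0..3 take _tile_len = 13
// and blocks 4..7 take 12 (4*13 + 4*12 = 100); block 4 starts at _tile_offset =
// 4*12 + 4 = 52. A 13-half tile is 26 bytes, not a multiple of BYTE_ALIGN = 32,
// so _copy_len rounds up to 32 / sizeof(half) = 16 elements; DataCopy moves whole
// 32-byte blocks and copyOut falls back to DataCopyPad for the 26 valid bytes.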
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyIn(int64_t i) {
// Alloc tensor from queue memory
LocalTensor<T> aLocal = _in_queue_a.AllocTensor<T>();
LocalTensor<T> bLocal = _in_queue_b.AllocTensor<T>();
// Get idx of current tile
auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
auto seq_idx = _batch == 1 ? i : i % _seq_len;
int64_t idxa = batch_idx * _stride_batch_a + seq_idx * _stride_seq_a + _tile_offset;
int64_t idxb = batch_idx * _stride_batch_b + seq_idx * _stride_seq_b + _tile_offset;
// Copy the i-th tile from global memory into the local tensors
DataCopy(aLocal, _a_gm[idxa], _copy_len);
DataCopy(bLocal, _b_gm[idxb], _copy_len);
// Enque input tensor to VECIN queue
_in_queue_a.EnQue(aLocal);
_in_queue_b.EnQue(bLocal);
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::compute(int64_t i) {
// Deque input tensors from VECIN queue
LocalTensor<T> aLocal = _in_queue_a.DeQue<T>();
LocalTensor<T> bLocal = _in_queue_b.DeQue<T>();
LocalTensor<T> cLocal = _out_queue_c.AllocTensor<T>();
// Call SwiGLU ascend api
SwiGLU<T, false>(cLocal, aLocal, bLocal, _beta_value, _copy_len);
// Enque result and free input
_out_queue_c.EnQue<T>(cLocal);
_in_queue_a.FreeTensor(aLocal);
_in_queue_b.FreeTensor(bLocal);
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::copyOut(int64_t i) {
// Deque output tensor from VECOUT queue
LocalTensor<T> cLocal = _out_queue_c.DeQue<T>();
auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
auto seq_idx = _batch == 1 ? i : i % _seq_len;
int64_t idxc = batch_idx * _stride_batch_c + seq_idx * _stride_seq_c + _tile_offset;
// Copy the i-th tile from the local tensor back to global memory
if (_tile_len * sizeof(T) % BYTE_ALIGN != 0) {
DataCopyExtParams dcep = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
DataCopyPad(_c_gm[idxc], cLocal, dcep);
} else {
DataCopy(_c_gm[idxc], cLocal, _tile_len);
}
// Free output Local tensor
_out_queue_c.FreeTensor(cLocal);
}
template <typename T>
__aicore__ inline void SwigluKernel<T>::process() {
for (int64_t i = 0; i < _batch * _seq_len; ++i) {
copyIn(i);
compute(i);
copyOut(i);
}
}
#define DEFINE_SWIGLU_KERNEL(KERNEL_NAME, TYPE) \
__global__ __aicore__ void KERNEL_NAME(GM_ADDR c, GM_ADDR a, GM_ADDR b, \
int64_t batch, int64_t seq, int64_t hd, \
int64_t stride_batch_c, \
int64_t stride_batch_a, \
int64_t stride_batch_b, \
int64_t stride_seq_c, \
int64_t stride_seq_a, \
int64_t stride_seq_b) { \
SwigluKernel<TYPE> op; \
op.init(c, a, b, \
batch, seq, hd, \
stride_batch_c, stride_batch_a, stride_batch_b, \
stride_seq_c, stride_seq_a, stride_seq_b); \
op.process(); \
}
DEFINE_SWIGLU_KERNEL(swiglu_kernel_half, half)
DEFINE_SWIGLU_KERNEL(swiglu_kernel_float, float)
#undef DEFINE_SWIGLU_KERNEL
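// Editor's note: the DEFINE_SWIGLU_KERNEL macro above stamps out one concrete
// __global__ entry point per dtype because the launcher below must name a
// non-template symbol in its <<<...>>> launch; the dtype switch in
// swiglu_kernel_launch then picks swiglu_kernel_half or swiglu_kernel_float.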
extern "C" infiniStatus_t swiglu_kernel_launch(
void *c, void *a, void *b,
infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b, void *stream) {
#define LAUNCH_SWIGLU_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
case DTYPE_ENUM: \
KERNEL_NAME<<<BLOCK_NUM, nullptr, stream>>>( \
c, a, b, \
static_cast<int64_t>(batch), \
static_cast<int64_t>(seq), \
static_cast<int64_t>(hd), \
stride_batch_c, stride_batch_a, stride_batch_b, \
stride_seq_c, stride_seq_a, stride_seq_b); \
return INFINI_STATUS_SUCCESS;
switch (dtype) {
LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F16, swiglu_kernel_half)
LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F32, swiglu_kernel_float)
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_SWIGLU_KERNEL
}
@@ -11,6 +11,9 @@
 #ifdef ENABLE_KUNLUN_API
 #include "kunlun/swiglu_kunlun.h"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/swiglu_ascend.h"
+#endif
 __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
     infiniopHandle_t handle,
@@ -46,11 +49,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
             c_desc, a_desc, b_desc);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    case DevAscendNpu:
-        return ascendCreateSwiGLUDescriptor(
-            (AscendHandle_t)handle, (SwiGLUAscendDescriptor_t *)desc_ptr,
-            c_desc, a_desc, b_desc);
+#ifdef ENABLE_ASCEND_API
+    CREATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu: {
@@ -94,7 +94,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
         return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
+#ifdef ENABLE_ASCEND_API
     GET(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_METAX_GPU
@@ -144,9 +144,8 @@ __C infiniStatus_t infiniopSwiGLU(
         return bangSwiGLU((SwiGLUBangDescriptor_t)desc, c, a, b, stream);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    case DevAscendNpu:
-        return ascendSwiGLU((SwiGLUAscendDescriptor_t)desc, c, a, b, stream);
+#ifdef ENABLE_ASCEND_API
+    CALCULATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu:
@@ -188,9 +187,8 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
         return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t)desc);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    case DevAscendNpu:
-        return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t)desc);
+#ifdef ENABLE_ASCEND_API
+    DELETE(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu:
...
@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {"atol": 0, "rtol": 1e-2},
+    torch.float16: {"atol": 1e-3, "rtol": 1e-2},
 }
@@ -143,6 +143,9 @@ def test(
     )
     lib_causal_softmax()
+    if sync is not None:
+        sync()
     atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
     if DEBUG:
...
@@ -476,10 +476,11 @@ def get_test_devices(args):
 def get_sync_func(device):
     import torch
+    device_str = infiniDeviceEnum_str_map[device]
     if device == InfiniDeviceEnum.CPU:
         sync = None
     else:
-        sync = getattr(torch, infiniDeviceEnum_str_map[device]).synchronize
+        sync = getattr(torch, device_str).synchronize
     return sync
@@ -50,9 +50,8 @@ target("infiniop-ascend")
     add_files("$(projectdir)/src/infiniop/devices/ascend/*.cc", "$(projectdir)/src/infiniop/ops/*/ascend/*.cc")
     -- Add operator
-    -- TODO: add it back after ascend-kernels is fixed
-    -- add_rules("ascend-kernels")
-    -- add_links(builddir.."/libascend_kernels.a")
+    add_rules("ascend-kernels")
+    add_links(builddir.."/libascend_kernels.a")
 target_end()
 target("infinirt-ascend")
...