Unverified commit e77735ef authored by PanZezhong1725, committed by GitHub

Merge pull request #65 from YdrMaster/main

issue/63 Refactor the operator definition file structure and adjust code style
parents b7893d65 3144cc9c
......@@ -2,7 +2,7 @@
#define __INFINIOP_HANDLE__
#include "../infinicore.h"
#include "./status.h"
#include "status.h"
typedef struct InfiniopHandle {
infiniDevice_t device;
......
#ifndef __INFINIOP_OPERATOR___
#define __INFINIOP_OPERATOR___
#include "./handle.h"
#include "./tensor_descriptor.h"
#include "handle.h"
#include "tensor_descriptor.h"
// Base descriptor for all operators
typedef struct InfiniopDescriptor {
infiniDevice_t device;
infiniDevice_t device_type;
int device_id;
} InfiniopDescriptor;
__C __export infiniopStatus_t infiniopGetDescriptorDeviceType(const InfiniopDescriptor *desc_ptr, infiniDevice_t *device_type);
__C __export infiniopStatus_t infiniopGetDescriptorDeviceId(const InfiniopDescriptor *desc_ptr, int *device_id);
#endif //__INFINIOP_OPERATOR___
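For orientation, a minimal caller-side sketch of the two new getters (the include path matches how other files in this diff include the header; the descriptor is assumed to come from some infiniopCreate*Descriptor call):

#include "infiniop/operator.h"
#include <cstdio>

// Query and print where a descriptor was created (sketch only).
void printDescriptorDevice(const InfiniopDescriptor *desc) {
    infiniDevice_t device_type;
    int device_id;
    if (infiniopGetDescriptorDeviceType(desc, &device_type) == INFINIOP_STATUS_SUCCESS
        && infiniopGetDescriptorDeviceId(desc, &device_id) == INFINIOP_STATUS_SUCCESS) {
        std::printf("device_type=%d device_id=%d\n", (int)device_type, device_id);
    }
}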
......@@ -2,8 +2,8 @@
#define __INFINIOP_ATTENTION_H__
#include "../operator.h"
#include "./matmul.h"
#include "./swiglu.h"
#include "matmul.h"
#include "swiglu.h"
typedef InfiniopDescriptor *infiniopAttentionDescriptor_t;
......@@ -23,9 +23,9 @@ __C __export infiniopStatus_t infiniopAttention(infiniopAttentionDescriptor_t de
void *workspace,
size_t workspace_size,
void *out,
void const *q,
void const *k,
void const *v,
const void *q,
const void *k,
const void *v,
void *k_cache,
void *v_cache,
void *stream);
......
......@@ -2,8 +2,8 @@
#define __INFINIOP_MLP_H__
#include "../operator.h"
#include "./matmul.h"
#include "./swiglu.h"
#include "matmul.h"
#include "swiglu.h"
typedef InfiniopDescriptor *infiniopMLPDescriptor_t;
......@@ -22,9 +22,9 @@ __C __export infiniopStatus_t infiniopMLP(infiniopMLPDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
void const *x,
void const *w12,
void const *w3,
const void *x,
const void *w12,
const void *w3,
void *stream);
__C __export infiniopStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc);
......
......@@ -2,7 +2,7 @@
#define __INFINIOP_TENSOR_DESCRIPTOR__
#include "../infinicore.h"
#include "./status.h"
#include "status.h"
struct InfiniopTensorDescriptor {
// Datatype
......@@ -17,7 +17,7 @@ struct InfiniopTensorDescriptor {
typedef struct InfiniopTensorDescriptor *infiniopTensorDescriptor_t;
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape, ptrdiff_t const *strides, infiniDtype_t dtype);
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, const size_t *shape, const ptrdiff_t *strides, infiniDtype_t dtype);
__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);
......
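A minimal creation/destruction sketch against this API (the row-major strides and F32 dtype are chosen for illustration; error handling elided):

size_t shape[2] = {2, 3};
ptrdiff_t strides[2] = {3, 1}; // contiguous row-major, strides in elements
infiniopTensorDescriptor_t desc = nullptr;
if (infiniopCreateTensorDescriptor(&desc, 2, shape, strides, INFINI_DTYPE_F32)
    == INFINIOP_STATUS_SUCCESS) {
    // ... pass desc to an operator's create function ...
    infiniopDestroyTensorDescriptor(desc);
}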
......@@ -91,7 +91,7 @@ def git_added_files():
try:
# Use "git diff --cached --name-only" to list all files staged in the index
result = subprocess.run(
["git", "diff", "--cached", "--name-only"],
["git", "diff", "--cached", "--diff-filter=AMR", "--name-only"],
capture_output=True,
text=True,
check=True,
......@@ -162,7 +162,7 @@ def main():
if args.ref is None and args.path is None:
# Last commit.
print("{Fore.GREEN}Formating git added files.{Style.RESET_ALL}")
print(f"{Fore.GREEN}Formating git added files.{Style.RESET_ALL}")
files = git_added_files()
else:
......
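Note: --diff-filter=AMR restricts the listing to Added, Modified, and Renamed paths, so files deleted in the staging area are no longer handed to the formatter, which would otherwise try to open paths that no longer exist.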
#ifndef __ACLNN_TENSOR__
#define __ACLNN_TENSOR__
#include "./common_ascend.h"
#include "common_ascend.h"
#include "infiniop/operator.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
......
#include "./common_cpu.h"
#include "common_cpu.h"
float f16_to_f32(uint16_t h) {
uint32_t sign = (h & 0x8000) << 16;
......@@ -59,9 +59,11 @@ uint16_t f32_to_f16(float val) {
}
}
size_t indexToReducedOffset(size_t flat_index, size_t ndim,
ptrdiff_t const *broadcasted_strides,
ptrdiff_t const *target_strides) {
size_t indexToReducedOffset(
size_t flat_index,
size_t ndim,
const ptrdiff_t *broadcasted_strides,
const ptrdiff_t *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
......@@ -70,8 +72,11 @@ size_t indexToReducedOffset(size_t flat_index, size_t ndim,
return res;
}
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape,
ptrdiff_t const *strides) {
size_t indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
......@@ -80,7 +85,10 @@ size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape,
return res;
}
size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
size_t getPaddedSize(
size_t ndim,
size_t *shape,
const size_t *pads) {
size_t total_size = 1;
for (size_t i = 0; i < ndim; ++i) {
total_size *= shape[i] + (i < 2 ? 0 : 2 * pads[i - 2]);
......@@ -88,8 +96,10 @@ size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
return total_size;
}
std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape,
size_t const *pads) {
std::vector<size_t> getPaddedShape(
size_t ndim,
const size_t *shape,
const size_t *pads) {
std::vector<size_t> padded_shape(ndim);
memcpy(padded_shape.data(), shape, ndim * sizeof(size_t));
for (size_t i = 2; i < ndim; ++i) {
......
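To make the index arithmetic concrete, a small worked example (shapes and strides chosen for illustration):

// indexToOffset: shape {2, 3}, flat index 4 decomposes to coordinates (1, 1).
// With row-major strides {3, 1} the offset is 1*3 + 1*1 = 4; with
// column-major strides {1, 2} it is 1*1 + 1*2 = 3.
size_t shape[2] = {2, 3};
ptrdiff_t col_major[2] = {1, 2};
size_t off = indexToOffset(4, 2, shape, col_major); // off == 3

// indexToReducedOffset: a {1, 3} tensor broadcast to {2, 3} has target
// strides {0, 1} against contiguous broadcasted strides {3, 1}; flat index 4
// of the broadcast view maps back to offset 4/3*0 + (4%3)*1 = 1.
ptrdiff_t broadcasted[2] = {3, 1}, target[2] = {0, 1};
size_t red = indexToReducedOffset(4, 2, broadcasted, target); // red == 1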
#ifndef __INFINIOP__COMMON_CPU_H__
#define __INFINIOP__COMMON_CPU_H__
#ifndef __INFINIOP_COMMON_CPU_H__
#define __INFINIOP_COMMON_CPU_H__
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
......@@ -13,18 +14,18 @@ float f16_to_f32(uint16_t code);
uint16_t f32_to_f16(float val);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
size_t indexToReducedOffset(size_t flat_index, size_t ndim, ptrdiff_t const *broadcasted_strides, ptrdiff_t const *target_strides);
size_t indexToReducedOffset(size_t flat_index, size_t ndim, const ptrdiff_t *broadcasted_strides, const ptrdiff_t *target_strides);
// return the memory offset of a tensor element given its flattened index
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, ptrdiff_t const *strides);
size_t indexToOffset(size_t flat_index, size_t ndim, const size_t *shape, const ptrdiff_t *strides);
/**
* get the total array size (element count) after applying padding for a
* ndim-ary tensor with the given shape
*/
size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads);
size_t getPaddedSize(size_t ndim, size_t *shape, const size_t *pads);
// calculate the padded shape and store the result in padded_shape
std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape, size_t const *pads);
std::vector<size_t> getPaddedShape(size_t ndim, const size_t *shape, const size_t *pads);
#endif // __INFINIOP_COMMON_CPU_H__
#include "./cpu_handle.h"
#include "cpu_handle.h"
infiniopStatus_t createCpuHandle(infiniopCpuHandle_t *handle_ptr) {
*handle_ptr = new InfiniopHandle{INFINI_DEVICE_CPU, 0};
......
......@@ -48,26 +48,25 @@ struct InfiniopCudaHandle {
};
template <typename T>
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool, int device_id, cudaStream_t stream, T const &f) {
auto handle = cublas_handle_pool->pop();
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> &pool, cudaStream_t stream, const T &f) {
auto handle = pool->pop();
if (!handle) {
cublasCreate(&(*handle));
}
cublasSetStream(*handle, (cudaStream_t)stream);
cublasSetStream(*handle, stream);
f(*handle);
cublas_handle_pool->push(std::move(*handle));
pool->push(std::move(*handle));
}
template <typename T>
cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool, int device_id, cudaStream_t stream, T const &f) {
auto handle = cudnn_handle_pool->pop();
void use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> &pool, cudaStream_t stream, const T &f) {
auto handle = pool->pop();
if (!handle) {
cudnnCreate(&(*handle));
}
cudnnSetStream(*handle, stream);
cudnnStatus_t status = f(*handle);
cudnn_handle_pool->push(std::move(*handle));
return status;
f(*handle);
pool->push(std::move(*handle));
}
inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
......@@ -96,8 +95,10 @@ inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
// return the memory offset of original tensor, given the flattened index of
// broadcasted tensor
inline __device__ __host__ size_t indexToReducedOffset(
size_t flat_index, size_t ndim, ptrdiff_t const *broadcasted_strides,
ptrdiff_t const *target_strides) {
size_t flat_index,
size_t ndim,
const ptrdiff_t *broadcasted_strides,
const ptrdiff_t *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
......@@ -107,9 +108,11 @@ inline __device__ __host__ size_t indexToReducedOffset(
}
// get the memory offset of the given element in a tensor given its flat index
inline __device__ __host__ size_t indexToOffset(size_t flat_index, size_t ndim,
size_t const *shape,
ptrdiff_t const *strides) {
inline __device__ __host__ size_t indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
......
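use_cublas and use_cudnn above assume a Pool<T> that hands back cached handles so that cublasCreate/cudnnCreate only runs on a cold pool. A minimal sketch of such a pool (this is an assumed shape, not the repo's actual Pool implementation):

#include <mutex>
#include <optional>
#include <vector>

template <typename T>
class Pool {
    std::vector<T> _items;
    std::mutex _mutex;

public:
    // Hand back a cached handle, or std::nullopt when the pool is empty,
    // in which case the caller creates a fresh handle before using it.
    std::optional<T> pop() {
        std::lock_guard<std::mutex> guard(_mutex);
        if (_items.empty()) {
            return std::nullopt;
        }
        T item = std::move(_items.back());
        _items.pop_back();
        return item;
    }

    void push(T &&item) {
        std::lock_guard<std::mutex> guard(_mutex);
        _items.push_back(std::move(item));
    }
};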
#include "./common_cuda.cuh"
#include "common_cuda.cuh"
infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, infiniDevice_t cuda_device_type) {
// Create a new cublas handle pool
......
#include "infiniop/handle.h"
#ifdef ENABLE_CPU_API
#include "./cpu/cpu_handle.h"
#include "cpu/cpu_handle.h"
#endif
#ifdef ENABLE_CUDA_API
#include "./cuda/cuda_handle.h"
#include "cuda/cuda_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "./bang/bang_handle.h"
#include "bang/bang_handle.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "./ascend/ascend_handle.h"
#include "ascend/ascend_handle.h"
#endif
__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr,
......
#include "infiniop/operator.h"
infiniopStatus_t infiniopGetDescriptorDeviceType(
const InfiniopDescriptor *desc_ptr,
infiniDevice_t *device_type) {
*device_type = desc_ptr->device_type;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t infiniopGetDescriptorDeviceId(
const InfiniopDescriptor *desc_ptr,
int *device_id) {
*device_id = desc_ptr->device_id;
return INFINIOP_STATUS_SUCCESS;
}
......@@ -41,7 +41,7 @@ __C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
}
__C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, size_t *size) {
switch (desc->device) {
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t)desc, size);
......@@ -79,7 +79,7 @@ __C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmax
}
__C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, size_t workspace_size, void *data, void *stream) {
switch (desc->device) {
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t)desc, workspace, workspace_size, data, stream);
......@@ -116,7 +116,7 @@ __C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t des
}
__C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc) {
switch (desc->device) {
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t)desc);
......
#include "matmul_aclnn.h"
InfiniopMatmulAclnnDescriptor::InfiniopMatmulAclnnDescriptor(
infiniDevice_t _device) {
device = _device;
device_id = 0;
executor = nullptr;
info = nullptr;
cDesc = new aclnnTensorDescriptor();
aDesc = new aclnnTensorDescriptor();
bDesc = new aclnnTensorDescriptor();
mt = 1;
workspaceSize = 0;
}
infiniopStatus_t aclnnCreateMatmulDescriptor(infiniopAscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
int8_t mt) {
infiniDtype_t dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
*desc_ptr = new InfiniopMatmulAclnnDescriptor(handle->device);
(*desc_ptr)->device_id = handle->device_id;
(*desc_ptr)->dtype = dtype;
(*desc_ptr)->mt = mt;
infiniopStatus_t status;
auto info = new MatmulInfo(c_desc, a_desc, b_desc, &status, false);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
(*desc_ptr)->info = info;
auto &cDesc = (*desc_ptr)->cDesc;
auto &aDesc = (*desc_ptr)->aDesc;
auto &bDesc = (*desc_ptr)->bDesc;
// Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
// operation
CHECK_STATUS(cDesc->setDescriptor(
toAclDataType(c_desc->dtype),
{static_cast<int64_t>(info->c_matrix.rows),
static_cast<int64_t>(info->c_matrix.cols)},
{info->c_matrix.row_stride, info->c_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->setDescriptor(
toAclDataType(a_desc->dtype),
{static_cast<int64_t>(info->a_matrix.rows),
static_cast<int64_t>(info->a_matrix.cols)},
{info->a_matrix.row_stride, info->a_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->setDescriptor(
toAclDataType(b_desc->dtype),
{static_cast<int64_t>(info->b_matrix.rows),
static_cast<int64_t>(info->b_matrix.cols)},
{info->b_matrix.row_stride, info->b_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(cDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
auto &workspaceSize = (*desc_ptr)->workspaceSize;
auto &executor = (*desc_ptr)->executor;
aclTensor *tc = cDesc->t;
aclTensor *ta = aDesc->t;
aclTensor *tb = bDesc->t;
aclnnStatus ret;
int64_t transA = 0;
int64_t transB = 0;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, 0.5f, 0.5f, transA, transB, tc,
(*desc_ptr)->mt, &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
aclSetAclOpExecutorRepeatable(executor);
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
size_t *size) {
*size = desc->workspaceSize;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
size_t workspace_size, void *c, void const *a,
void const *b, float alpha, float beta,
void *stream) {
auto &cDesc = desc->cDesc;
auto &aDesc = desc->aDesc;
auto &bDesc = desc->bDesc;
aclTensor *tc = cDesc->t;
aclTensor *ta = aDesc->t;
aclTensor *tb = bDesc->t;
auto batch = desc->info->batch;
size_t workspaceSize;
aclnnStatus ret;
ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, alpha, beta, 0, 0, tc, desc->mt,
&workspaceSize, &(desc->executor));
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
if (workspace_size < workspaceSize) {
return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE;
}
aclSetAclOpExecutorRepeatable(desc->executor);
for (size_t i = 0; i < batch; i++) {
AclSetTensorAddr(desc->executor, 0, ta,
(char *)(a) + i * desc->info->a_matrix.stride * infiniSizeof(desc->dtype));
AclSetTensorAddr(desc->executor, 1, tb,
(char *)(b) + i * desc->info->b_matrix.stride * infiniSizeof(desc->dtype));
AclSetTensorAddr(desc->executor, 2, tc,
(char *)(c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));
AclSetTensorAddr(desc->executor, 3, tc,
(char *)(c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));
ret = aclnnGemm(workspace, workspaceSize, desc->executor, stream);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
}
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) {
delete desc->cDesc;
delete desc->bDesc;
delete desc->aDesc;
delete desc->info;
aclDestroyAclOpExecutor(desc->executor);
delete desc;
return INFINIOP_STATUS_SUCCESS;
}
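The loop above reuses a single 2D GEMM executor across the batch by rebinding tensor base addresses per iteration. The offset arithmetic it relies on, isolated as a sketch (name and example values are for illustration only):

// Byte offset of batch slice i: the batch stride is counted in elements,
// so it is scaled by the element size. E.g. F16 (2 bytes) with stride 4096:
// batch 3 starts 3 * 4096 * 2 = 24576 bytes past the base pointer.
inline char *batchSlice(void *base, size_t i, ptrdiff_t stride, size_t elem_size) {
    return (char *)base + i * stride * elem_size;
}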
#ifndef __ACLNN_MATMUL_H__
#define __ACLNN_MATMUL_H__
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "../blas.h"
#include "matmul_aclnn_api.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
struct InfiniopMatmulAclnnDescriptor {
infiniDevice_t device;
int device_id;
aclOpExecutor *executor;
MatmulInfo *info;
infiniDtype_t dtype;
aclnnTensorDescriptor_t cDesc, aDesc, bDesc;
// cubeMathType
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t mt;
size_t workspaceSize;
InfiniopMatmulAclnnDescriptor(infiniDevice_t _device);
};
#endif
#ifndef __INFINIOP_MATMUL_ACLNN_API_H__
#define __INFINIOP_MATMUL_ACLNN_API_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"
struct InfiniopMatmulAclnnDescriptor;
typedef struct InfiniopMatmulAclnnDescriptor *MatmulAclnnDescriptor_t;
infiniopStatus_t aclnnCreateMatmulDescriptor(infiniopAscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
int8_t cubeMathType);
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
size_t *size);
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
size_t workspace_size, void *c, const void *a,
const void *b, float alpha, float beta,
void *stream);
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc);
#endif // __INFINIOP_MATMUL_ACLNN_API_H__
#include "matmul_ascend.h"
#include "../../../devices/ascend/ascend_handle.h"
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
namespace matmul::ascend {
struct Descriptor::Opaque {
mutable aclOpExecutor *executor;
aclnnTensorDescriptor_t c, a, b;
// cubeMathType
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t mt;
~Opaque() {
delete c;
delete a;
delete b;
aclDestroyAclOpExecutor(executor);
}
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniopStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<infiniopAscendHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
auto c = new aclnnTensorDescriptor(),
a = new aclnnTensorDescriptor(),
b = new aclnnTensorDescriptor();
// Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
// operation
CHECK_STATUS(c->setDescriptor(
toAclDataType(c_desc->dtype),
{static_cast<int64_t>(info.c_matrix.rows),
static_cast<int64_t>(info.c_matrix.cols)},
{info.c_matrix.row_stride, info.c_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(a->setDescriptor(
toAclDataType(a_desc->dtype),
{static_cast<int64_t>(info.a_matrix.rows),
static_cast<int64_t>(info.a_matrix.cols)},
{info.a_matrix.row_stride, info.a_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(b->setDescriptor(
toAclDataType(b_desc->dtype),
{static_cast<int64_t>(info.b_matrix.rows),
static_cast<int64_t>(info.b_matrix.cols)},
{info.b_matrix.row_stride, info.b_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(c->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(a->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(b->createTensor(), INFINIOP_STATUS_SUCCESS);
auto tc = c->t,
ta = a->t,
tb = b->t;
aclOpExecutor *executor;
size_t workspace_size;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
int8_t mt = 1;
auto ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, .5, .5, 0, 0, tc, mt, &workspace_size, &executor);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
aclSetAclOpExecutorRepeatable(executor);
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
new Opaque{
executor,
c,
a,
b,
mt,
},
handle->device, handle->device_id);
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t Descriptor::calculate(
void *workspace,
size_t workspaceSize_,
void *c,
float beta,
const void *a,
const void *b,
float alpha,
void *stream) const {
auto tc = _opaque->c->t,
ta = _opaque->a->t,
tb = _opaque->b->t;
size_t workspace_size;
auto ret = aclnnGemmGetWorkspaceSize(
ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
&workspace_size, &(_opaque->executor));
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
if (workspaceSize_ < workspace_size) {
return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE;
}
aclSetAclOpExecutorRepeatable(_opaque->executor);
auto unit = infiniSizeof(_dtype);
for (size_t i = 0; i < _info.batch; ++i) {
AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
ret = aclnnGemm(workspace, workspace_size, _opaque->executor, stream);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
}
return INFINIOP_STATUS_SUCCESS;
}
} // namespace matmul::ascend
#ifndef __MATMUL_ASCEND_H__
#define __MATMUL_ASCEND_H__
#include "../matmul.h"
DESCRIPTOR(ascend)
#endif // __MATMUL_ASCEND_H__
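The Opaque struct together with the out-of-line destructor is the pImpl idiom: ACL-specific members (the executor, aclnn tensor descriptors, cubeMathType) stay out of the shared matmul header, while DESCRIPTOR(ascend) presumably stamps out the common Descriptor class for this backend; its definition lives in ../matmul.h, which is not part of this diff.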