Commit 58c0de0c authored by Pan Zezhong's avatar Pan Zezhong
Browse files

feat: ascend matmul

parent 46da1a27
......@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16.0)
# project information
project(Ascend_C)
set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory")
set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME} CACHE PATH "ASCEND CANN package installation directory")
set(RUN_MODE "npu" CACHE STRING "run mode: npu")
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
......
#include "ascend_handle.h"
#include "common_ascend.h"
infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id) {
infiniopStatus_t createAscendHandle(infiniopAscendHandle_t *handle_ptr, int device_id) {
uint32_t device_count;
aclrtGetDeviceCount(&device_count);
if (device_id >= static_cast<int>(device_count)) {
return STATUS_BAD_DEVICE;
return INFINIOP_STATUS_BAD_DEVICE;
}
auto ret = aclrtSetDevice(device_id);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret));
*handle_ptr = new AscendContext{DevAscendNpu, device_id};
*handle_ptr = new InfiniopAscendHandle{INFINI_DEVICE_ASCEND, device_id};
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr) {
infiniopStatus_t deleteAscendHandle(infiniopAscendHandle_t handle_ptr) {
delete handle_ptr;
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
#ifndef ASCEND_HANDLE_H
#define ASCEND_HANDLE_H
#ifndef __INFINIOP_ASCEND_HANDLE_H__
#define __INFINIOP_ASCEND_HANDLE_H__
#include "common_ascend.h"
#include "device.h"
#include "status.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <aclnn/acl_meta.h>
#include <memory>
struct AscendContext {
Device device;
int device_id;
};
typedef struct AscendContext *AscendHandle_t;
#include "infinicore.h"
#include "infiniop/handle.h"
infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id);
struct InfiniopAscendHandle;
typedef struct InfiniopAscendHandle *infiniopAscendHandle_t;
infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr);
infiniopStatus_t createAscendHandle(infiniopAscendHandle_t *handle_ptr, int device_id);
infiniopStatus_t deleteAscendHandle(infiniopAscendHandle_t handle_ptr);
#endif
......@@ -8,16 +8,16 @@ int64_t numElements(const int64_t *shape, int64_t num) {
return numEle;
}
infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize) {
infiniopStatus_t mallocWorkspace(void **workspaceAddr, size_t workspaceSize) {
*workspaceAddr = nullptr;
if (workspaceSize > 0) {
auto ret = aclrtMalloc(workspaceAddr, workspaceSize,
ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret);
return STATUS_EXECUTION_FAILED);
return INFINIOP_STATUS_INTERNAL_ERROR);
}
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t freeWorkspace(void *workspaceAddr) {
......@@ -25,35 +25,35 @@ infiniopStatus_t freeWorkspace(void *workspaceAddr) {
auto ret = aclrtFree(workspaceAddr);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclrtFree failed, ERROR: %d\n", ret);
return STATUS_EXECUTION_FAILED);
return INFINIOP_STATUS_INTERNAL_ERROR);
}
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
aclDataType toAclDataType(DT dt) {
if (dt == I8)
aclDataType toAclDataType(infiniDtype_t dt) {
if (dt == INFINI_DTYPE_I8)
return aclDataType::ACL_INT8;
else if (dt == I16)
else if (dt == INFINI_DTYPE_I16)
return aclDataType::ACL_INT16;
else if (dt == I32)
else if (dt == INFINI_DTYPE_I32)
return aclDataType::ACL_INT32;
else if (dt == I64)
else if (dt == INFINI_DTYPE_I64)
return aclDataType::ACL_INT64;
else if (dt == U8)
else if (dt == INFINI_DTYPE_U8)
return aclDataType::ACL_UINT8;
else if (dt == U16)
else if (dt == INFINI_DTYPE_U16)
return aclDataType::ACL_UINT16;
else if (dt == U32)
else if (dt == INFINI_DTYPE_U32)
return aclDataType::ACL_UINT32;
else if (dt == U64)
else if (dt == INFINI_DTYPE_U64)
return aclDataType::ACL_UINT64;
else if (dt == F16)
else if (dt == INFINI_DTYPE_F16)
return aclDataType::ACL_FLOAT16;
else if (dt == BF16)
else if (dt == INFINI_DTYPE_BF16)
return aclDataType::ACL_BF16;
else if (dt == F32)
else if (dt == INFINI_DTYPE_F32)
return aclDataType::ACL_FLOAT;
else if (dt == F64)
else if (dt == INFINI_DTYPE_F64)
return aclDataType::ACL_DOUBLE;
else
return aclDataType::ACL_DT_UNDEFINED;
......
#ifndef __COMMON_ASCEND_H__
#define __COMMON_ASCEND_H__
#ifndef __INFINIOP_COMMON_ASCEND_H__
#define __INFINIOP_COMMON_ASCEND_H__
#include "operators.h"
#include "ascend_handle.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <aclnn/acl_meta.h>
#include <cstdio>
#include <functional>
#include <inttypes.h>
......@@ -31,11 +32,16 @@ extern "C" {
};
#endif
// Concrete definition behind the opaque infiniopAscendHandle_t pointer.
// An instance is allocated by createAscendHandle() and released by
// deleteAscendHandle().
struct InfiniopAscendHandle {
// Device kind tag; createAscendHandle() stores INFINI_DEVICE_ASCEND here.
infiniDevice_t device;
// Index of the device this handle was created for (the value passed to
// aclrtSetDevice() when the handle is created).
int device_id;
};
int64_t numElements(const int64_t *shape, int64_t num);
const char *dataTypeToString(aclDataType dtype);
const char *formatToString(aclFormat format);
infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize);
infiniopStatus_t mallocWorkspace(void **workspaceAddr, size_t workspaceSize);
infiniopStatus_t freeWorkspace(void *workspaceAddr);
aclDataType toAclDataType(DT dt);
aclDataType toAclDataType(infiniDtype_t dt);
#endif
......@@ -4,7 +4,7 @@
infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const std::vector<int64_t> &shape, const std::vector<int64_t> &strides) {
if (shape.size() != strides.size()) {
return STATUS_BAD_PARAM;
return INFINIOP_STATUS_BAD_TENSOR_STRIDES;
}
this->ndim = shape.size();
this->shape = std::vector<int64_t>(shape);
......@@ -16,9 +16,9 @@ infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const s
aclFormat format = aclFormat::ACL_FORMAT_ND;
this->format = format;
CHECK_STATUS(this->inferStorageShape(), STATUS_SUCCESS);
CHECK_STATUS(this->inferStorageShape(), INFINIOP_STATUS_SUCCESS);
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
......@@ -30,7 +30,7 @@ infiniopStatus_t aclnnTensorDescriptor::inferStorageShape() {
this->storageNdim = 1;
this->storageShape = std::vector<int64_t>({this->shape[max_stride_index] * this->strides[max_stride_index]});
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
/// @brief Set aclnnTensorDescriptor from infiniopTensorDescriptor
......@@ -45,7 +45,7 @@ infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTen
shape[i] = static_cast<int64_t>(y->shape[i]);
strides[i] = y->strides[i];
}
return setDescriptor(toAclDataType(y->dt), shape, strides);
return setDescriptor(toAclDataType(y->dtype), shape, strides);
}
/// @brief Wrapper of aclCreateTensor. Create aclTensor.
......@@ -56,7 +56,7 @@ infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTen
/// @return
infiniopStatus_t aclnnTensorDescriptor::createTensor(void *data) {
if (this->t) {
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
this->t = aclCreateTensor(this->shape.data(),
this->ndim,
......@@ -67,17 +67,17 @@ infiniopStatus_t aclnnTensorDescriptor::createTensor(void *data) {
this->storageShape.data(),
this->storageNdim,
data);
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnTensorDescriptor::destroyTensor() {
auto ret = aclDestroyTensor(this->t);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclDesctroyTensor failed, ERROR: %d\n", ret);
return STATUS_EXECUTION_FAILED);
return INFINIOP_STATUS_INTERNAL_ERROR);
t = nullptr;
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
aclnnTensorDescriptor::~aclnnTensorDescriptor() {
......
......@@ -2,9 +2,7 @@
#define __ACLNN_TENSOR__
#include "./common_ascend.h"
#include "operators.h"
#include "tensor.h"
#include "tensor/tensor_descriptor.h"
#include "infiniop/operator.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
......
......@@ -8,7 +8,7 @@
#ifdef ENABLE_CAMBRICON_MLU
#include "./bang/bang_handle.h"
#endif
#ifdef ENABLE_ASCEND_NPU
#ifdef ENABLE_ASCEND_API
#include "./ascend/ascend_handle.h"
#endif
......@@ -37,7 +37,7 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, infiniDe
}
#endif
#ifdef ENABLE_ASCEND_API
case DevAscendNpu: {
case INFINI_DEVICE_ASCEND: {
return createAscendHandle((infiniopAscendHandle_t *) handle_ptr, device_id);
}
#endif
......@@ -64,8 +64,8 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
return STATUS_SUCCESS;
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
return deleteAscendHandle((infiniopAscendHandle_t) handle);
}
#endif
......
#include "matmul_aclnn.h"
MatmulAclnnDescriptor::MatmulAclnnDescriptor(Device _device) {
MatmulAclnnDescriptor::MatmulAclnnDescriptor(infiniDevice_t _device) {
device = _device;
device_id = 0;
executor = nullptr;
......@@ -8,35 +8,29 @@ MatmulAclnnDescriptor::MatmulAclnnDescriptor(Device _device) {
cDesc = new aclnnTensorDescriptor();
aDesc = new aclnnTensorDescriptor();
bDesc = new aclnnTensorDescriptor();
alpha = 1.0;
beta = 0;
mt = 1;
workspaceSize = 0;
}
infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
infiniopStatus_t aclnnCreateMatmulDescriptor(infiniopAscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta,
int8_t mt) {
DT dtype = c_desc->dt;
if (dtype != F16 && dtype != F32) {
return STATUS_BAD_TENSOR_DTYPE;
infiniDtype_t dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
*desc_ptr = new MatmulAclnnDescriptor(handle->device);
(*desc_ptr)->device_id = handle->device_id;
(*desc_ptr)->dtype = dtype;
(*desc_ptr)->mt = mt;
(*desc_ptr)->alpha = alpha;
(*desc_ptr)->beta = beta;
infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED};
auto info = new MatmulInfo(c_desc, a_desc, b_desc, status, false);
if (*status != STATUS_SUCCESS) {
return *status;
infiniopStatus_t status;
auto info = new MatmulInfo(c_desc, a_desc, b_desc, &status, false);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
(*desc_ptr)->info = info;
......@@ -44,15 +38,30 @@ infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
auto &aDesc = (*desc_ptr)->aDesc;
auto &bDesc = (*desc_ptr)->bDesc;
// Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched operation
CHECK_STATUS(cDesc->setDescriptor(toAclDataType(c_desc->dt), {info->c_matrix.rows, info->c_matrix.cols}, {info->c_matrix.row_stride, info->c_matrix.col_stride}), STATUS_SUCCESS);
CHECK_STATUS(aDesc->setDescriptor(toAclDataType(a_desc->dt), {info->a_matrix.rows, info->a_matrix.cols}, {info->a_matrix.row_stride, info->a_matrix.col_stride}), STATUS_SUCCESS);
CHECK_STATUS(bDesc->setDescriptor(toAclDataType(b_desc->dt), {info->b_matrix.rows, info->b_matrix.cols}, {info->b_matrix.row_stride, info->b_matrix.col_stride}), STATUS_SUCCESS);
CHECK_STATUS(cDesc->createTensor(), STATUS_SUCCESS);
CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS);
CHECK_STATUS(bDesc->createTensor(), STATUS_SUCCESS);
// Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
// operation
CHECK_STATUS(cDesc->setDescriptor(
toAclDataType(c_desc->dtype),
{static_cast<int64_t>(info->c_matrix.rows),
static_cast<int64_t>(info->c_matrix.cols)},
{info->c_matrix.row_stride, info->c_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->setDescriptor(
toAclDataType(a_desc->dtype),
{static_cast<int64_t>(info->a_matrix.rows),
static_cast<int64_t>(info->a_matrix.cols)},
{info->a_matrix.row_stride, info->a_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->setDescriptor(
toAclDataType(b_desc->dtype),
{static_cast<int64_t>(info->b_matrix.rows),
static_cast<int64_t>(info->b_matrix.cols)},
{info->b_matrix.row_stride, info->b_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(cDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
auto &workspaceSize = (*desc_ptr)->workspaceSize;
auto &executor = (*desc_ptr)->executor;
......@@ -63,33 +72,31 @@ infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
aclnnStatus ret;
int64_t transA = 0;
int64_t transB = 0;
// aclnnGemm support C = alpha * A @ B + beta * C
// see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, (*desc_ptr)->alpha, (*desc_ptr)->beta, transA, transB, tc,
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, 0.5f, 0.5f, transA, transB, tc,
(*desc_ptr)->mt, &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return STATUS_EXECUTION_FAILED);
return INFINIOP_STATUS_INTERNAL_ERROR);
aclSetAclOpExecutorRepeatable(executor);
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
uint64_t *size) {
size_t *size) {
*size = desc->workspaceSize;
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
size_t workspace_size, void *c, void const *a,
void const *b, float alpha, float beta,
void *stream) {
auto &cDesc = desc->cDesc;
auto &aDesc = desc->aDesc;
......@@ -101,30 +108,40 @@ infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
auto batch = desc->info->batch;
auto &executor = desc->executor;
auto &workspaceSize = desc->workspaceSize;
// Set runing on handle device
aclrtSetDevice(desc->device_id);
for (int i = 0; i < batch; i++) {
AclSetTensorAddr(executor, 0, ta, (char *) (a) + i * desc->info->a_matrix.stride * desc->dtype.size);
AclSetTensorAddr(executor, 1, tb, (char *) (b) + i * desc->info->b_matrix.stride * desc->dtype.size);
AclSetTensorAddr(executor, 2, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size);
AclSetTensorAddr(executor, 3, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size);
aclnnStatus ret = aclnnGemm(workspace,
workspaceSize,
executor,
stream);
size_t workspaceSize;
aclnnStatus ret;
ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, alpha, beta, 0, 0, tc, desc->mt,
&workspaceSize, &(desc->executor));
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
if (workspace_size < workspaceSize) {
return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE;
}
aclSetAclOpExecutorRepeatable(desc->executor);
for (size_t i = 0; i < batch; i++) {
AclSetTensorAddr(desc->executor, 0, ta,
(char *)(a) + i * desc->info->a_matrix.stride *
infini_sizeof(desc->dtype));
AclSetTensorAddr(desc->executor, 1, tb,
(char *)(b) + i * desc->info->b_matrix.stride *
infini_sizeof(desc->dtype));
AclSetTensorAddr(desc->executor, 2, tc,
(char *)(c) + i * desc->info->c_matrix.stride *
infini_sizeof(desc->dtype));
AclSetTensorAddr(desc->executor, 3, tc,
(char *)(c) + i * desc->info->c_matrix.stride *
infini_sizeof(desc->dtype));
ret = aclnnGemm(workspace, workspaceSize, desc->executor, stream);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
return STATUS_EXECUTION_FAILED);
return INFINIOP_STATUS_INTERNAL_ERROR);
}
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) {
delete desc->cDesc;
delete desc->bDesc;
......@@ -133,5 +150,5 @@ infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) {
aclDestroyAclOpExecutor(desc->executor);
delete desc;
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
#ifndef __ACLNN_MATMUL_H__
#define __ACLNN_MATMUL_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "../blas.h"
#include "operators.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <aclnnop/aclnn_matmul.h>
#include "matmul_aclnn_api.h"
struct MatmulAclnnDescriptor {
Device device;
infiniDevice_t device;
int device_id;
aclOpExecutor* executor;
MatmulInfo* info;
DT dtype;
infiniDtype_t dtype;
aclnnTensorDescriptor_t cDesc, aDesc, bDesc;
// cubeMathType
// see doc: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
float alpha;
float beta;
int8_t mt;
uint64_t workspaceSize;
size_t workspaceSize;
MatmulAclnnDescriptor(Device _device);
MatmulAclnnDescriptor(infiniDevice_t _device);
};
typedef struct MatmulAclnnDescriptor *MatmulAclnnDescriptor_t;
infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta,
int8_t cubeMathType);
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
uint64_t *size);
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream);
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc);
#endif
#ifndef __INFINIOP_MATMUL_ACLNN_API_H__
#define __INFINIOP_MATMUL_ACLNN_API_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"
// Opaque matmul descriptor for the Ascend (aclnn) backend. Only the forward
// declaration is exposed here so that callers of this API do not need the
// ACL/aclnn headers.
struct MatmulAclnnDescriptor;
typedef struct MatmulAclnnDescriptor *MatmulAclnnDescriptor_t;
/// Creates a matmul descriptor for C = alpha * A @ B + beta * C.
/// @param handle        Ascend handle; its device and device_id are copied
///                      into the new descriptor.
/// @param desc_ptr      Receives the newly allocated descriptor.
/// @param c_desc        Descriptor of the output matrix C. The accompanying
///                      implementation only accepts F16 and F32 and returns
///                      INFINIOP_STATUS_BAD_TENSOR_DTYPE otherwise.
/// @param a_desc        Descriptor of the left operand A.
/// @param b_desc        Descriptor of the right operand B.
/// @param cubeMathType  Stored in the descriptor and forwarded to
///                      aclnnGemmGetWorkspaceSize as its cube math type.
/// @return INFINIOP_STATUS_SUCCESS on success, otherwise an error status.
infiniopStatus_t aclnnCreateMatmulDescriptor(infiniopAscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
int8_t cubeMathType);
/// Writes the workspace size recorded in the descriptor to *size.
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
size_t *size);
/// Executes the matmul described by desc.
/// @param workspace       Scratch buffer handed to aclnnGemm.
/// @param workspace_size  Size of workspace in bytes; if it is smaller than
///                        the size aclnnGemmGetWorkspaceSize reports, the
///                        implementation returns
///                        INFINIOP_STATUS_INSUFFICIENT_WORKSPACE.
/// @param c               Output buffer (also read as the beta-scaled input).
/// @param a               Left operand buffer.
/// @param b               Right operand buffer.
/// @param alpha           Scale applied to A @ B.
/// @param beta            Scale applied to the existing contents of C.
/// @param stream          Stream passed through to aclnnGemm.
/// @return INFINIOP_STATUS_SUCCESS on success, otherwise an error status.
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
float alpha,
float beta,
void *stream);
/// Releases the descriptor together with the tensor descriptors and the
/// aclnn executor it owns.
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc);
#endif // __INFINIOP_MATMUL_ACLNN_API_H__
......@@ -47,7 +47,7 @@ typedef struct BlasMatrix {
*status = INFINIOP_STATUS_SUCCESS;
}
bool match_batch(int batch) const {
bool match_batch(size_t batch) const {
return this->batch == batch || this->batch == 1;
}
......
......@@ -10,8 +10,8 @@
#ifdef ENABLE_CAMBRICON_MLU
#include "bang/matmul_cnnl.h"
#endif
#ifdef ENABLE_ASCEND_NPU
#include "ascend/matmul_aclnn.h"
#ifdef ENABLE_ASCEND_API
#include "ascend/matmul_aclnn_api.h"
#endif
__C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
......@@ -34,21 +34,19 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
return bangCreateMatmulDescriptor((BangHandle_t) handle, (MatmulBangDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnCreateMatmulDescriptor((AscendHandle_t) handle,
(MatmulAclnnDescriptor_t *) desc_ptr,
c_desc,
a_desc,
b_desc,
1);
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
return aclnnCreateMatmulDescriptor(
(infiniopAscendHandle_t)handle,
(MatmulAclnnDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc, 1);
}
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
}
__C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, uint64_t *size) {
__C infiniopStatus_t
infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, size_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
......@@ -65,8 +63,8 @@ __C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t d
return bangGetMatmulWorkspaceSize((MatmulBangDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc,
size);
}
......@@ -75,7 +73,10 @@ __C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t d
return INFINIOP_STATUS_BAD_DEVICE;
}
__C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, float alpha, float beta, void *stream) {
__C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc,
void *workspace, size_t workspace_size,
void *c, void const *a, void const *b,
float alpha, float beta, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
......@@ -87,20 +88,14 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, void *works
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangMatmul((MatmulBangDescriptor_t) desc, workspace, workspace_size, c, alpha, a, b, beta, stream);
return bangMatmul((MatmulBangDescriptor_t)desc, workspace,
workspace_size, c, a, b, alpha, beta, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return aclnnMatmul((MatmulAclnnDescriptor_t) desc,
workspace,
workspace_size,
c,
alpha,
a,
b,
beta,
stream);
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND:
return aclnnMatmul((MatmulAclnnDescriptor_t)desc, workspace,
workspace_size, c, a, b, alpha, beta, stream);
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
......@@ -123,8 +118,8 @@ __C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t
return bangDestroyMatmulDescriptor((MatmulBangDescriptor_t) desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc);
}
#endif
......
......@@ -63,7 +63,6 @@ def open_lib():
assert (
library_path is not None
), f"Cannot find infiniop.dll or libinfiniop.so. Check if INFINI_ROOT is set correctly."
ctypes.CDLL(r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin\cudnn64_9.dll")
lib = ctypes.CDLL(library_path)
lib.infiniopCreateTensorDescriptor.argtypes = [
POINTER(infiniopTensorDescriptor_t),
......
......@@ -194,7 +194,7 @@ def test_cpu(lib, test_cases):
destroy_handle(lib, handle)
def test_cuda(lib, test_cases):
def test_nvidia(lib, test_cases):
device = InfiniDeviceEnum.NVIDIA
handle = create_handle(lib, device)
......@@ -227,7 +227,7 @@ def test_cuda(lib, test_cases):
destroy_handle(lib, handle)
def test_bang(lib, test_cases):
def test_cambricon(lib, test_cases):
import torch_mlu
device = InfiniDeviceEnum.CAMBRICON
handle = create_handle(lib, device)
......@@ -348,12 +348,12 @@ if __name__ == "__main__":
PROFILE = True
if args.cpu:
test_cpu(lib, test_cases)
if args.cuda:
test_cuda(lib, test_cases)
if args.bang:
test_bang(lib, test_cases)
if args.nvidia:
test_nvidia(lib, test_cases)
if args.cambricon:
test_cambricon(lib, test_cases)
if args.ascend:
test_ascend(lib, test_cases)
if not (args.cpu or args.cuda or args.bang or args.ascend):
if not (args.cpu or args.nvidia or args.cambricon or args.ascend):
test_cpu(lib, test_cases)
print("\033[92mTest passed!\033[0m")
......@@ -61,6 +61,7 @@ option_end()
if has_config("ascend-npu") then
add_defines("ENABLE_ASCEND_API")
includes("xmake/ascend.lua")
end
-- 沐曦
......@@ -126,7 +127,7 @@ target("infiniop")
add_deps("cambricon-mlu")
end
if has_config("ascend-npu") then
add_deps("ascend-npu")
add_deps("infiniop-ascend")
end
if has_config("metax-gpu") then
add_deps("metax-gpu")
......
add_defines("ENABLE_ASCEND_API")

-- Root of the CANN toolkit installation. It is taken from the ASCEND_HOME
-- environment variable; when that variable is unset or empty we fall back to
-- the toolkit's default install path. Without the fallback, the string
-- concatenations below abort the configuration with an opaque
-- "attempt to concatenate a nil value" error.
local ASCEND_HOME = os.getenv("ASCEND_HOME")
if ASCEND_HOME == nil or ASCEND_HOME == "" then
    ASCEND_HOME = "/usr/local/Ascend/ascend-toolkit/latest"
end
-- Target SoC name from the environment; not referenced further in this file.
local SOC_VERSION = os.getenv("SOC_VERSION")

-- Add include dirs
add_includedirs(ASCEND_HOME .. "/include")
add_includedirs(ASCEND_HOME .. "/include/aclnn")

-- ACL runtime and aclnn operator libraries shipped with the toolkit
add_linkdirs(ASCEND_HOME .. "/lib64")
add_links("libascendcl.so")
add_links("libnnopbase.so")
add_links("libopapi.so")
add_links("libruntime.so")

-- The HAL library is looked up in the driver directory, addressed relative
-- to the toolkit root.
add_linkdirs(ASCEND_HOME .. "/../../driver/lib64/driver")
add_links("libascend_hal.so")

-- Output directory for the current platform/architecture/mode; the
-- "ascend-kernels" rule copies the prebuilt kernel archive here.
local builddir = string.format(
    "%s/build/%s/%s/%s",
    os.projectdir(),
    get_config("plat"),
    get_config("arch"),
    get_config("mode")
)
-- Rule that delegates the Ascend kernel build to the Makefile located in
-- src/infiniop/devices/ascend and stages the resulting static archive in the
-- xmake build directory.
rule("ascend-kernels")
    -- Before linking: run `make` in the kernel directory, copy the archive
    -- into the build directory, then return to the project root.
    before_link(function ()
        local project_root = os.projectdir()
        local kernel_dir = path.join(project_root, "src/infiniop/devices/ascend")
        local kernel_archive = "$(projectdir)/src/infiniop/devices/ascend/build/lib/libascend_kernels.a"
        local staging_dir = builddir .. "/"

        os.cd(kernel_dir)
        os.exec("make")
        os.cp(kernel_archive, staging_dir)
        os.cd(os.projectdir())
    end)

    -- After clean: run `make clean` in the kernel directory, return to the
    -- project root, and remove the staged archive.
    after_clean(function ()
        local project_root = os.projectdir()
        local kernel_dir = path.join(project_root, "src/infiniop/devices/ascend")
        local staged_archive = builddir .. "/libascend_kernels.a"

        os.cd(kernel_dir)
        os.exec("make clean")
        os.cd(os.projectdir())
        os.rm(staged_archive)
    end)
rule_end()
-- Static library target containing the Ascend device layer and the Ascend
-- implementations of the operators. The top-level "infiniop" target depends
-- on it when the ascend-npu option is enabled.
target("infiniop-ascend")
-- Other configs
set_kind("static")
set_languages("cxx17")
-- Empty install hook: nothing is installed for this target on its own.
on_install(function (target) end)
-- Add files
add_files("$(projectdir)/src/infiniop/devices/ascend/*.cc", "$(projectdir)/src/infiniop/ops/*/ascend/*.cc")
-- NOTE(review): "-lstdc++" is a linker option and has no effect when passed
-- as a compile flag; all flags are also given as one space-separated string
-- rather than as separate arguments -- confirm that xmake applies them as
-- intended.
add_cxflags("-lstdc++ -Wall -Werror -fPIC")
-- Add operator
-- TODO: add it back after ascend-kernels is fixed
-- add_rules("ascend-kernels")
-- add_links(builddir.."/libascend_kernels.a")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment