Commit 8e83bdca authored by PanZezhong's avatar PanZezhong
Browse files

feat: cambricon matmul, add fp32

parent 58c0de0c
#include "bang_handle.h"
#include "common_bang.h"
#include <memory>
#include "../pool.h"
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_id) {
unsigned int device_count;
......@@ -19,3 +21,8 @@ infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_i
return INFINIOP_STATUS_SUCCESS;
}
// Destroy a Bang (Cambricon MLU) handle previously created by createBangHandle.
// NOTE(review): the cnnl_handles pool may still hold cnnlHandle_t objects that
// were created with cnnlCreate (see use_cnnl); nothing here calls cnnlDestroy
// on them. Verify whether Pool's destructor (or another owner of the shared
// pool) releases them, otherwise CNNL handles leak on teardown.
infiniopStatus_t deleteBangHandle(infiniopBangHandle_t handle){
delete handle;
return INFINIOP_STATUS_SUCCESS;
}
#ifndef BANG_HANDLE_H
#define BANG_HANDLE_H
#include "../pool.h"
#include "cnnl.h"
#include "cnrt.h"
#include "infiniop/handle.h"
#include <memory>
// Per-device runtime state for the Cambricon BANG backend.
struct InfiniopBangHandle {
// Backend identifier stored on the handle (copied into op descriptors).
infiniDevice_t device;
// MLU device ordinal, used with cnrtSetDevice when creating CNNL handles.
int device_id;
// Shared pool of reusable cnnlHandle_t objects (see use_cnnl).
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
};
struct InfiniopBangHandle;
typedef struct InfiniopBangHandle *infiniopBangHandle_t;
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_id);
// Borrow a cnnlHandle_t from `pool` (creating one on `device_id` if the pool
// is empty), bind it to `queue`, invoke f(handle), then return the handle to
// the pool for reuse.
// NOTE(review): when pop() yields a falsy result the code still writes
// through *handle via cnnlCreate(&(*handle)) — this relies on Pool::pop()
// returning an object whose storage is valid even when it tests false;
// confirm against Pool's implementation.
// NOTE(review): cnrtSetDevice is only called on the creation path; handles
// reused from the pool are assumed to already belong to `device_id`.
template<typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, int device_id, cnrtQueue_t queue, T const &f) {
auto handle = pool->pop();
if (!handle) {
// First use: create a fresh CNNL handle on the target device.
cnrtSetDevice(device_id);
cnnlCreate(&(*handle));
}
// Bind the compute queue so CNNL calls made by `f` run on it.
cnnlSetQueue(*handle, (cnrtQueue_t) queue);
f(*handle);
pool->push(std::move(*handle));
}
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr,
int device_id);
infiniopStatus_t deleteBangHandle(infiniopBangHandle_t handle);
#endif
#ifndef __COMMON_BANG_H__
#define __COMMON_BANG_H__
#include "../pool.h"
#include "bang_handle.h"
#include "cnnl.h"
#include "infinicore.h"
#include "cnrt.h"
#include "infiniop/tensor_descriptor.h"
#include <memory>
#include <vector>
const int NRAM_MAX_SIZE = 1024 * 256;//the maximum NRAM memory is 1024 * 768
const int GDRAM_MAX_SIZE = 1024 * 1024 * 1024;
// the maximum NRAM memory is 1024 * 768
#define NRAM_MAX_SIZE (1024 * 256)
// set cnnl tensor descriptor without strides
// Configure `desc` as a contiguous (stride-less) CNNL array descriptor using
// the shape from `layout`. The element type is hard-coded to fp16 here.
inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) {
    const uint64_t ndim = layout->ndim;
    std::vector<int> shape(ndim);
    for (uint64_t axis = 0; axis < ndim; ++axis) {
        shape[axis] = static_cast<int>(layout->shape[axis]);
    }
    cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
                            static_cast<int>(shape.size()), shape.data());
}
#define GDRAM_MAX_SIZE (1024 * 1024 * 1024)
// set cnnl tensor descriptor with strides
// Configure `desc` as a strided CNNL array descriptor. Strides from `layout`
// are divided by the element size of layout->dt (they appear to be stored in
// bytes — confirm). The element type is hard-coded to fp16 here.
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) {
    const uint64_t ndim = layout->ndim;
    std::vector<int> shape(ndim), strides(ndim);
    for (uint64_t axis = 0; axis < ndim; ++axis) {
        shape[axis] = static_cast<int>(layout->shape[axis]);
        strides[axis] = static_cast<int>(layout->strides[axis] / layout->dt.size);
    }
    cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
                              static_cast<int>(shape.size()), shape.data(),
                              strides.data());
}
// Per-device runtime state for the Cambricon BANG backend.
// NOTE(review): this definition also appears in bang_handle.h; two copies
// risk an ODR violation if both headers reach the same translation unit —
// confirm only one is actually included.
struct InfiniopBangHandle {
infiniDevice_t device;
int device_id;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
};
// Map an infini runtime dtype to the corresponding CNNL dtype.
// Returns CNNL_DTYPE_INVALID for dtypes CNNL has no mapping for.
// (The scraped source fused an older if/else chain with this switch into
// invalid C++; this is the clean switch form with all cases preserved.)
inline cnnlDataType_t cnnlDataTypeConvert(infiniDtype_t dataType) {
    switch (dataType) {
        case INFINI_DTYPE_F32:
            return CNNL_DTYPE_FLOAT;
        case INFINI_DTYPE_F64:
            return CNNL_DTYPE_DOUBLE;
        case INFINI_DTYPE_F16:
            return CNNL_DTYPE_HALF;
        case INFINI_DTYPE_I8:
            return CNNL_DTYPE_INT8;
        case INFINI_DTYPE_I32:
            return CNNL_DTYPE_INT32;
        case INFINI_DTYPE_U8:
            return CNNL_DTYPE_UINT8;
        case INFINI_DTYPE_BF16:
            return CNNL_DTYPE_BFLOAT16;
        case INFINI_DTYPE_I64:
            return CNNL_DTYPE_INT64;
        default:
            return CNNL_DTYPE_INVALID;
    }
}
#endif// __COMMON_BANG_H__
// Borrow a cnnlHandle_t from `pool` (creating one lazily if the pool is
// empty), bind it to `queue`, invoke f(handle), then return it to the pool.
// NOTE(review): cnnlCreate writes through *handle even when pop() tested
// false — relies on Pool::pop() returning valid storage in that case;
// confirm against Pool's implementation.
template <typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, cnrtQueue_t queue,
T const &f) {
auto handle = pool->pop();
if (!handle) {
cnnlCreate(&(*handle));
}
// Bind the compute queue so CNNL calls made by `f` run on it.
cnnlSetQueue(*handle, (cnrtQueue_t)queue);
f(*handle);
pool->push(std::move(*handle));
}
// Queue-less variant: borrow (or lazily create) a cnnlHandle_t, run f with
// it, and return it to the pool. The handle's existing queue binding, if
// any, is left untouched.
template <typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, T const &f) {
auto handle = pool->pop();
if (!handle) {
cnnlCreate(&(*handle));
}
f(*handle);
pool->push(std::move(*handle));
}
// Fill `desc` as a contiguous (stride-less) CNNL array descriptor: shape
// copied from `layout`, dtype mapped through cnnlDataTypeConvert.
inline void setCnnlTensor(cnnlTensorDescriptor_t desc,
                          const infiniopTensorDescriptor_t layout) {
    const size_t ndim = layout->ndim;
    std::vector<int> shape(ndim);
    for (size_t axis = 0; axis < ndim; ++axis) {
        shape[axis] = static_cast<int>(layout->shape[axis]);
    }
    cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY,
                            cnnlDataTypeConvert(layout->dtype),
                            static_cast<int>(shape.size()), shape.data());
}
// Fill `desc` as a strided CNNL array descriptor: shape and strides are
// taken from `layout` unchanged, dtype mapped through cnnlDataTypeConvert.
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc,
                            const infiniopTensorDescriptor_t layout) {
    const size_t ndim = layout->ndim;
    std::vector<int> shape(ndim), strides(ndim);
    for (size_t axis = 0; axis < ndim; ++axis) {
        shape[axis] = static_cast<int>(layout->shape[axis]);
        strides[axis] = static_cast<int>(layout->strides[axis]);
    }
    cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY,
                              cnnlDataTypeConvert(layout->dtype),
                              static_cast<int>(shape.size()), shape.data(),
                              strides.data());
}
#endif // __COMMON_BANG_H__
......@@ -5,7 +5,7 @@
#ifdef ENABLE_CUDA_API
#include "./cuda/cuda_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_MLU
#ifdef ENABLE_CAMBRICON_API
#include "./bang/bang_handle.h"
#endif
#ifdef ENABLE_ASCEND_API
......@@ -32,7 +32,7 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, infiniDe
}
#endif
#ifdef ENABLE_CAMBRICON_API
case DevCambriconMlu: {
case INFINI_DEVICE_CAMBRICON: {
return createBangHandle((infiniopBangHandle_t *) handle_ptr, device_id);
}
#endif
......@@ -58,10 +58,9 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
return deleteCudaHandle((infiniopCudaHandle_t) handle);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
delete (infiniopBangHandle_t) handle;
return STATUS_SUCCESS;
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
return deleteBangHandle((infiniopBangHandle_t) handle);
}
#endif
#ifdef ENABLE_ASCEND_API
......
#include "matmul_cnnl.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h"
#include "../../utils.h"
#include "cnrt.h"
infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
MatmulBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta) {
infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED};
auto info = MatmulInfo(c_desc, a_desc, b_desc, status, false);
if (*status != STATUS_SUCCESS) {
return *status;
#include "matmul_cnnl_api.h"
infiniopStatus_t bangCreateMatmulDescriptor(
infiniopBangHandle_t handle, infiniopMatmulBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, false);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
cnnlCreateTensorDescriptor(&aDesc);
cnnlCreateTensorDescriptor(&bDesc);
cnnlCreateTensorDescriptor(&cDesc);
setMatrixTensorEx(aDesc, info.a_matrix);
setMatrixTensorEx(bDesc, info.b_matrix);
setMatrixTensorEx(cDesc, info.c_matrix);
setMatrixTensorEx(aDesc, info.a_matrix, a_desc->dtype);
setMatrixTensorEx(bDesc, info.b_matrix, b_desc->dtype);
setMatrixTensorEx(cDesc, info.c_matrix, c_desc->dtype);
cnnlMatMulDescriptor_t opDesc;
cnnlMatMulAlgo_t algo;
......@@ -33,28 +30,37 @@ infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
int32_t use_stride = true;
cnnlSetMatMulDescAttr(opDesc, CNNL_MATMUL_USE_STRIDE, &use_stride,
sizeof(int32_t));
*desc_ptr = new MatmulBangDescriptor{
handle->device,
int count = 0;
use_cnnl(handle->cnnl_handles, [&](cnnlHandle_t _handle) {
cnnlGetBatchMatMulAlgoHeuristic(_handle, opDesc, aDesc, bDesc, cDesc,
NULL, 1, &algoResult, &count);
});
size_t workspace_size;
cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size);
*desc_ptr = new InfiniopMatmulBangDescriptor{handle->device,
handle->device_id,
info,
alpha,
beta,
c_desc->dt,
c_desc->dtype,
handle->cnnl_handles,
aDesc,
bDesc,
cDesc,
opDesc,
algo,
algoResult};
return STATUS_SUCCESS;
algoResult,
workspace_size};
return INFINIOP_STATUS_SUCCESS;
}
// Report the scratch-space size required by bangMatmul. The value was
// computed once at descriptor-creation time via
// cnnlGetBatchMatMulHeuristicResult and cached on the descriptor.
// (The scraped source fused the old and new bodies; this is the clean new
// version.)
infiniopStatus_t bangGetMatmulWorkspaceSize(infiniopMatmulBangDescriptor_t desc,
                                            size_t *size) {
    *size = desc->workspace_size;
    return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) {
infiniopStatus_t
bangDestroyMatmulDescriptor(infiniopMatmulBangDescriptor_t desc) {
desc->cnnl_handles = nullptr;
cnnlDestroyTensorDescriptor(desc->aDesc);
cnnlDestroyTensorDescriptor(desc->bDesc);
......@@ -63,41 +69,32 @@ infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) {
cnnlMatMulAlgoDestroy(desc->algo);
cnnlDestroyMatMulHeuristicResult(desc->algoResult);
delete desc;
return STATUS_SUCCESS;
return INFINIOP_STATUS_SUCCESS;
}
void matmul_cnnl_f16(MatmulBangDescriptor_t desc, void *workspace, void *c, float beta, void const *a, void const *b, float alpha, void *stream) {
void matmul_cnnl(infiniopMatmulBangDescriptor_t desc, void *workspace, void *c,
float beta, void const *a, void const *b, float alpha,
void *stream) {
auto info = desc->info;
if (info.is_transed) {
std::swap(a, b);
}
use_cnnl(desc->cnnl_handles, desc->device_id, (cnrtQueue_t) stream,
[&](cnnlHandle_t handle) {
int count = 0;
cnnlGetBatchMatMulAlgoHeuristic(handle, desc->opDesc, desc->aDesc,
desc->bDesc, desc->cDesc,
NULL, 1, &desc->algoResult, &count);
size_t wsSize;
cnnlGetBatchMatMulHeuristicResult(desc->algoResult, desc->algo, &wsSize);
cnrtMalloc(&workspace, wsSize);
cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo,
&alpha, desc->aDesc, a,
desc->bDesc, b,
&beta, desc->cDesc, c,
workspace, wsSize);
use_cnnl(desc->cnnl_handles, (cnrtQueue_t)stream, [&](cnnlHandle_t handle) {
cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo, &alpha,
desc->aDesc, a, desc->bDesc, b, &beta,
desc->cDesc, c, workspace,
desc->workspace_size);
});
}
// Entry point for the Cambricon matmul: validates the dtype, runs the CNNL
// kernel, then synchronizes the queue so the result is ready on return.
// `workspace_size` is accepted for API symmetry; the descriptor caches the
// size actually used. (The scraped source fused the old fp16-only body with
// the new one; this is the clean new version supporting fp16 and fp32.)
infiniopStatus_t bangMatmul(infiniopMatmulBangDescriptor_t desc,
                            void *workspace, size_t workspace_size, void *c,
                            void const *a, void const *b, float alpha,
                            float beta, void *stream) {
    if (desc->dtype == INFINI_DTYPE_F16 || desc->dtype == INFINI_DTYPE_F32) {
        matmul_cnnl(desc, workspace, c, beta, a, b, alpha, stream);
        // Block until the kernel finishes on the queue.
        cnrtQueueSync((cnrtQueue_t)stream);
        return INFINIOP_STATUS_SUCCESS;
    }
    return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
#ifndef __CNNL_MATMUL_H__
#define __CNNL_MATMUL_H__
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h"
#include "../blas.h"
#include "cnnl.h"
#include "cnnl_extra.h"
#include "operators.h"
struct MatmulBangDescriptor {
Device device;
struct InfiniopMatmulBangDescriptor {
infiniDevice_t device;
int device_id;
MatmulInfo info;
float alpha;
float beta;
DT dtype;
infiniDtype_t dtype;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
cnnlTensorDescriptor_t aDesc;
cnnlTensorDescriptor_t bDesc;
......@@ -20,24 +16,12 @@ struct MatmulBangDescriptor {
cnnlMatMulDescriptor_t opDesc;
cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult;
size_t workspace_size;
};
typedef struct MatmulBangDescriptor *MatmulBangDescriptor_t;
infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
MatmulBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta);
infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size);
infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream);
infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc);
inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &matrix, bool trans = false) {
inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc,
const BlasMatrix &matrix, infiniDtype_t dtype,
bool trans = false) {
int ndim = matrix.ndim;
int batch = matrix.batch;
int stride = static_cast<int>(matrix.stride);
......@@ -49,15 +33,16 @@ inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &mat
if (ndim == 3) {
std::vector<int> dim_size = {batch, rows, cols};
std::vector<int> dim_stride = {stride, row_stride, col_stride};
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
dim_size.size(), dim_size.data(), dim_stride.data());
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(dtype), dim_size.size(),
dim_size.data(), dim_stride.data());
} else if (ndim == 2) {
std::vector<int> dim_size = {rows, cols};
std::vector<int> dim_stride = {row_stride, col_stride};
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
dim_size.size(), dim_size.data(), dim_stride.data());
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(dtype), dim_size.size(),
dim_size.data(), dim_stride.data());
}
}
#endif// __CNNL_MATMUL_H__
#endif // __CNNL_MATMUL_H__
#ifndef __CNNL_MATMUL_API_H__
#define __CNNL_MATMUL_API_H__
// Public API of the Cambricon (BANG/CNNL) matmul operator: descriptor
// lifecycle, workspace query, and execution.
#include "../../../devices/bang/bang_handle.h"
#include "infiniop/operator.h"
// Opaque descriptor type; the full definition lives in matmul_cnnl.h.
struct InfiniopMatmulBangDescriptor;
typedef struct InfiniopMatmulBangDescriptor *infiniopMatmulBangDescriptor_t;
// Build a matmul descriptor from the C/A/B tensor layouts; selects a CNNL
// algorithm heuristically and caches the required workspace size.
infiniopStatus_t bangCreateMatmulDescriptor(
infiniopBangHandle_t handle, infiniopMatmulBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
// Report the scratch bytes bangMatmul requires for this descriptor.
infiniopStatus_t bangGetMatmulWorkspaceSize(infiniopMatmulBangDescriptor_t desc,
size_t *size);
// Compute c = alpha * a @ b + beta * c on the given stream/queue.
infiniopStatus_t bangMatmul(infiniopMatmulBangDescriptor_t desc,
void *workspace, size_t workspace_size, void *c,
void const *a, void const *b, float alpha,
float beta, void *stream);
// Release all CNNL objects held by the descriptor and free it.
infiniopStatus_t
bangDestroyMatmulDescriptor(infiniopMatmulBangDescriptor_t desc);
#endif
......@@ -3,10 +3,9 @@
#include "../../utils.h"
#include <cmath>
infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
MatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopStatus_t cpuCreateMatmulDescriptor(
infiniopCpuHandle_t handle, infiniopMatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype;
......@@ -20,26 +19,27 @@ infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
return status;
}
*desc_ptr = new MatmulCpuDescriptor{
INFINI_DEVICE_CPU,
dtype,
info};
*desc_ptr = new MatmulCpuDescriptor{INFINI_DEVICE_CPU, dtype, info};
return INFINIOP_STATUS_SUCCESS;
}
// The CPU matmul computes results with plain loops and needs no scratch
// memory, so the required workspace is always zero. (The scraped source
// fused the old and new signatures; this is the clean new version.)
infiniopStatus_t cpuGetMatmulWorkspaceSize(infiniopMatmulCpuDescriptor_t desc,
                                           uint64_t *size) {
    *size = 0;
    return INFINIOP_STATUS_SUCCESS;
}
// Free the CPU matmul descriptor; it owns no resources beyond itself.
// (The scraped source fused the old and new signatures; this is the clean
// new version.)
infiniopStatus_t
cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc) {
    delete desc;
    return INFINIOP_STATUS_SUCCESS;
}
template<typename Tdata>
infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha) {
template <typename Tdata>
infiniopStatus_t matmul_cpu(infiniopMatmulCpuDescriptor_t desc, void *c,
float beta, void const *a, void const *b,
float alpha) {
auto info = desc->info;
if (info.is_transed) {
......@@ -49,11 +49,20 @@ infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, voi
for (int i = 0; i < info.batch; ++i) {
for (int m_ = 0; m_ < info.m; ++m_) {
for (int n_ = 0; n_ < info.n; ++n_) {
auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride;
auto c_ = reinterpret_cast<Tdata *>(c) +
i * info.c_matrix.stride +
m_ * info.c_matrix.row_stride +
n_ * info.c_matrix.col_stride;
float sum = 0;
for (int k_ = 0; k_ < info.k; ++k_) {
auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride;
auto a_ = reinterpret_cast<Tdata const *>(a) +
i * info.a_matrix.stride +
m_ * info.a_matrix.row_stride +
k_ * info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) +
i * info.b_matrix.stride +
n_ * info.b_matrix.col_stride +
k_ * info.b_matrix.row_stride;
if constexpr (std::is_same<Tdata, uint16_t>::value) {
sum += f16_to_f32(*a_) * f16_to_f32(*b_);
} else {
......@@ -75,14 +84,9 @@ infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, voi
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta) {
infiniopStatus_t cpuMatmul(infiniopMatmulCpuDescriptor_t desc, void *workspace,
uint64_t workspace_size, void *c, void const *a,
void const *b, float alpha, float beta) {
if (desc->dtype == INFINI_DTYPE_F16) {
return matmul_cpu<uint16_t>(desc, c, beta, a, b, alpha);
}
......
......@@ -11,25 +11,20 @@ typedef struct MatmulCpuDescriptor {
MatmulInfo info;
} MatmulCpuDescriptor;
typedef struct MatmulCpuDescriptor *MatmulCpuDescriptor_t;
typedef struct MatmulCpuDescriptor *infiniopMatmulCpuDescriptor_t;
infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
MatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopStatus_t cpuCreateMatmulDescriptor(
infiniopCpuHandle_t handle, infiniopMatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size);
infiniopStatus_t cpuGetMatmulWorkspaceSize(infiniopMatmulCpuDescriptor_t desc,
uint64_t *size);
infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta);
infiniopStatus_t cpuMatmul(infiniopMatmulCpuDescriptor_t desc, void *workspace,
uint64_t workspace_size, void *c, void const *a,
void const *b, float alpha, float beta);
infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc);
infiniopStatus_t cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc);
#endif// __INFINIOP_MATMUL_CPU_H__
#endif // __INFINIOP_MATMUL_CPU_H__
......@@ -7,38 +7,43 @@
#ifdef ENABLE_CUDA_API
#include "cuda/matmul_cuda_api.h"
#endif
#ifdef ENABLE_CAMBRICON_MLU
#include "bang/matmul_cnnl.h"
#ifdef ENABLE_CAMBRICON_API
#include "bang/matmul_cnnl_api.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/matmul_aclnn_api.h"
#endif
__C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
infiniopMatmulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
__C infiniopStatus_t infiniopCreateMatmulDescriptor(
infiniopHandle_t handle, infiniopMatmulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
switch (handle->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuCreateMatmulDescriptor((infiniopCpuHandle_t) handle, (MatmulCpuDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
return cpuCreateMatmulDescriptor(
(infiniopCpuHandle_t)handle,
(infiniopMatmulCpuDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return cudaCreateMatmulDescriptor((infiniopCudaHandle_t) handle, (infiniopMatmulCudaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
return cudaCreateMatmulDescriptor(
(infiniopCudaHandle_t)handle,
(infiniopMatmulCudaDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateMatmulDescriptor((BangHandle_t) handle, (MatmulBangDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
return bangCreateMatmulDescriptor(
(infiniopBangHandle_t)handle,
(infiniopMatmulBangDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
return aclnnCreateMatmulDescriptor(
(infiniopAscendHandle_t)handle,
(MatmulAclnnDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc, 1);
return aclnnCreateMatmulDescriptor((infiniopAscendHandle_t)handle,
(MatmulAclnnDescriptor_t *)desc_ptr,
c_desc, a_desc, b_desc, 1);
}
#endif
}
......@@ -50,23 +55,25 @@ infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, size_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuGetMatmulWorkspaceSize((MatmulCpuDescriptor_t) desc, size);
return cpuGetMatmulWorkspaceSize((infiniopMatmulCpuDescriptor_t)desc,
size);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return cudaGetMatmulWorkspaceSize((infiniopMatmulCudaDescriptor_t) desc, size);
return cudaGetMatmulWorkspaceSize((infiniopMatmulCudaDescriptor_t)desc,
size);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetMatmulWorkspaceSize((MatmulBangDescriptor_t) desc, size);
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
return bangGetMatmulWorkspaceSize((infiniopMatmulBangDescriptor_t)desc,
size);
}
#endif
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc,
size);
return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t)desc, size);
}
#endif
}
......@@ -80,15 +87,17 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc,
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuMatmul((MatmulCpuDescriptor_t) desc, workspace, workspace_size, c, a, b, alpha, beta);
return cpuMatmul((infiniopMatmulCpuDescriptor_t)desc, workspace,
workspace_size, c, a, b, alpha, beta);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA:
return cudaMatmul((infiniopMatmulCudaDescriptor_t) desc, workspace, workspace_size, c, a, b, alpha, beta, stream);
return cudaMatmul((infiniopMatmulCudaDescriptor_t)desc, workspace,
workspace_size, c, a, b, alpha, beta, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangMatmul((MatmulBangDescriptor_t)desc, workspace,
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
return bangMatmul((infiniopMatmulBangDescriptor_t)desc, workspace,
workspace_size, c, a, b, alpha, beta, stream);
}
#endif
......@@ -101,26 +110,28 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc,
return INFINIOP_STATUS_BAD_DEVICE;
}
__C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
__C infiniopStatus_t
infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuDestroyMatmulDescriptor((MatmulCpuDescriptor_t) desc);
return cpuDestroyMatmulDescriptor((infiniopMatmulCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return cudaDestroyMatmulDescriptor((infiniopMatmulCudaDescriptor_t) desc);
return cudaDestroyMatmulDescriptor(
(infiniopMatmulCudaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyMatmulDescriptor((MatmulBangDescriptor_t) desc);
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
return bangDestroyMatmulDescriptor((infiniopMatmulBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc);
return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t)desc);
}
#endif
}
......
......@@ -50,6 +50,7 @@ option_end()
if has_config("cambricon-mlu") then
add_defines("ENABLE_CAMBRICON_API")
includes("xmake/bang.lua")
end
-- 华为昇腾
......@@ -124,7 +125,7 @@ target("infiniop")
end
if has_config("cambricon-mlu") then
add_deps("cambricon-mlu")
add_deps("infiniop-cambricon")
end
if has_config("ascend-npu") then
add_deps("infiniop-ascend")
......@@ -143,5 +144,4 @@ target("infiniop")
add_installfiles("include/infiniop/*.h", {prefixdir = "include/infiniop"})
add_installfiles("include/infiniop.h", {prefixdir = "include"})
add_installfiles("include/infinicore.h", {prefixdir = "include"})
target_end()
-- Locate the Cambricon Neuware toolkit; fall back to the default install
-- prefix when NEUWARE_HOME is not exported in the environment.
local NEUWARE_HOME = os.getenv("NEUWARE_HOME") or "/usr/local/neuware"
add_includedirs(path.join(NEUWARE_HOME, "include"))
add_linkdirs(path.join(NEUWARE_HOME, "lib64"))
add_linkdirs(path.join(NEUWARE_HOME, "lib"))
-- NOTE(review): xmake's add_links conventionally takes bare library names
-- ("cnrt", "cnnl", ...) rather than full "libfoo.so" filenames; confirm
-- these resolve with the toolchain in use before relying on them.
add_links("libcnrt.so")
add_links("libcnnl.so")
add_links("libcnnl_extra.so")
add_links("libcnpapi.so")
-- Custom xmake rule that compiles Cambricon *.mlu kernel sources with the
-- cncc compiler and feeds the resulting objects into the link step.
-- Change vs. original: removed a `local includedirs = table.concat(...)`
-- that was computed but never used (the loop below builds the -I flags).
rule("mlu")
    set_extensions(".mlu")
    on_load(function (target)
        -- Make the project's public headers visible to every .mlu file.
        target:add("includedirs", path.join(os.projectdir(), "include"))
    end)
    on_build_file(function (target, sourcefile)
        local objectfile = target:objectfile(sourcefile)
        os.mkdir(path.directory(objectfile))
        local cc = "cncc"
        -- NOTE: the MLU arch is hard-coded to mtp_592; parameterize this if
        -- other boards must be supported.
        local args = {"-c", sourcefile, "-o", objectfile, "--bang-mlu-arch=mtp_592", "-O3", "-fPIC", "-Wall", "-Werror", "-std=c++17", "-pthread"}
        for _, includedir in ipairs(target:get("includedirs")) do
            table.insert(args, "-I" .. includedir)
        end
        os.execv(cc, args)
        -- Register the object so the target links it.
        table.insert(target:objectfiles(), objectfile)
    end)
rule_end()
-- Root of the infiniop operator sources.
local src_dir = path.join(os.projectdir(), "src", "infiniop")
-- Static library bundling the Cambricon device layer and every bang
-- operator implementation (C++ sources plus any .mlu kernels).
target("infiniop-cambricon")
set_kind("static")
-- No install step for this intermediate static library.
on_install(function (target) end)
set_languages("cxx17")
add_files(src_dir.."/devices/bang/*.cc", src_dir.."/ops/*/bang/*.cc")
local mlu_files = os.files(src_dir .. "/ops/*/bang/*.mlu")
if #mlu_files > 0 then
add_files(mlu_files, {rule = "mlu"})
end
-- NOTE(review): "-lstdc++" is a linker flag; placed in add_cxflags it is
-- passed to the compile step where it has no effect. Consider
-- add_syslinks("stdc++") instead — confirm before changing.
add_cxflags("-lstdc++ -Wall -Werror -fPIC")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment