Commit 8e83bdca authored by PanZezhong's avatar PanZezhong
Browse files

feat: cambricon matmul, add fp32

parent 58c0de0c
#include "bang_handle.h" #include "common_bang.h"
#include <memory>
#include "../pool.h"
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_id) { infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_id) {
unsigned int device_count; unsigned int device_count;
...@@ -19,3 +21,8 @@ infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_i ...@@ -19,3 +21,8 @@ infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_i
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
} }
// Destroys a BANG device handle previously created by createBangHandle().
// Deleting the struct releases its members via their destructors (including
// the shared_ptr to the cnnl handle pool). NOTE(review): individual pooled
// cnnlHandle_t objects are not explicitly cnnlDestroy()'d here -- confirm
// the Pool destructor is responsible for that.
infiniopStatus_t deleteBangHandle(infiniopBangHandle_t handle){
    delete handle;
    return INFINIOP_STATUS_SUCCESS;
}
#ifndef BANG_HANDLE_H #ifndef BANG_HANDLE_H
#define BANG_HANDLE_H #define BANG_HANDLE_H
#include "../pool.h"
#include "cnnl.h"
#include "cnrt.h"
#include "infiniop/handle.h" #include "infiniop/handle.h"
#include <memory>
struct InfiniopBangHandle { struct InfiniopBangHandle;
infiniDevice_t device;
int device_id;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
};
typedef struct InfiniopBangHandle *infiniopBangHandle_t; typedef struct InfiniopBangHandle *infiniopBangHandle_t;
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_id); infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr,
int device_id);
template<typename T> infiniopStatus_t deleteBangHandle(infiniopBangHandle_t handle);
// (template<typename T> header is on the preceding line)
// Borrow a cnnlHandle_t from `pool`, bind it to `queue` on `device_id`,
// invoke `f(handle)`, then return the handle to the pool for reuse.
// If the pool is empty, a fresh cnnl handle is created lazily on the
// selected device. NOTE(review): when pop() yields an empty slot this
// writes through *handle before it holds a value -- assumes Pool's element
// type keeps valid storage in that state; confirm Pool<T>::pop()'s contract.
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, int device_id, cnrtQueue_t queue, T const &f) {
    auto handle = pool->pop();
    if (!handle) {
        // Pool exhausted: bind the device, then create a new cnnl handle.
        cnrtSetDevice(device_id);
        cnnlCreate(&(*handle));
    }
    // Attach the caller's queue so cnnl work runs on the caller's stream.
    cnnlSetQueue(*handle, (cnrtQueue_t) queue);
    f(*handle);
    pool->push(std::move(*handle));
}
#endif #endif
#ifndef __COMMON_BANG_H__ #ifndef __COMMON_BANG_H__
#define __COMMON_BANG_H__ #define __COMMON_BANG_H__
#include "../pool.h"
#include "bang_handle.h"
#include "cnnl.h" #include "cnnl.h"
#include "infinicore.h" #include "cnrt.h"
#include "infiniop/tensor_descriptor.h"
#include <memory>
#include <vector> #include <vector>
const int NRAM_MAX_SIZE = 1024 * 256;//the maximum NRAM memory is 1024 * 768 // the maximum NRAM memory is 1024 * 768
const int GDRAM_MAX_SIZE = 1024 * 1024 * 1024; #define NRAM_MAX_SIZE (1024 * 256)
// set cnnl tensor descriptor without strides11 #define GDRAM_MAX_SIZE (1024 * 1024 * 1024)
inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) {
std::vector<int> dims(layout->ndim);
for (uint64_t i = 0; i < layout->ndim; i++) {
dims[i] = static_cast<int>(layout->shape[i]);
}
cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
dims.size(), dims.data());
}
// set cnnl tensor descriptor with strides struct InfiniopBangHandle {
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) { infiniDevice_t device;
std::vector<int> dim_size(layout->ndim), dim_stride(layout->ndim); int device_id;
for (uint64_t i = 0; i < layout->ndim; i++) { std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
dim_size[i] = static_cast<int>(layout->shape[i]); };
dim_stride[i] = static_cast<int>(layout->strides[i] / layout->dt.size);
}
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
dim_size.size(), dim_size.data(), dim_stride.data());
}
inline cnnlDataType_t cnnlDataTypeConvert(infiniDtype_t dataType) { inline cnnlDataType_t cnnlDataTypeConvert(infiniDtype_t dataType) {
if (dtype_eq(dataType, INFINI_DTYPE_F32)) { switch (dataType) {
case INFINI_DTYPE_F32:
return CNNL_DTYPE_FLOAT; return CNNL_DTYPE_FLOAT;
} else if (dtype_eq(dataType, INFINI_DTYPE_F64)) { case INFINI_DTYPE_F64:
return CNNL_DTYPE_DOUBLE; return CNNL_DTYPE_DOUBLE;
} else if (dtype_eq(dataType, INFINI_DTYPE_F16)) { case INFINI_DTYPE_F16:
return CNNL_DTYPE_HALF; return CNNL_DTYPE_HALF;
} else if (dtype_eq(dataType, INFINI_DTYPE_I8)) { case INFINI_DTYPE_I8:
return CNNL_DTYPE_INT8; return CNNL_DTYPE_INT8;
} else if (dtype_eq(dataType, INFINI_DTYPE_I32)) { case INFINI_DTYPE_I32:
return CNNL_DTYPE_INT32; return CNNL_DTYPE_INT32;
} else if (dtype_eq(dataType, INFINI_DTYPE_U8)) { case INFINI_DTYPE_U8:
return CNNL_DTYPE_UINT8; return CNNL_DTYPE_UINT8;
} else if (dtype_eq(dataType, INFINI_DTYPE_BF16)) { case INFINI_DTYPE_BF16:
return CNNL_DTYPE_BFLOAT16; return CNNL_DTYPE_BFLOAT16;
} else if (dtype_eq(dataType, INFINI_DTYPE_I64)) { case INFINI_DTYPE_I64:
return CNNL_DTYPE_INT64; return CNNL_DTYPE_INT64;
} else { default:
return CNNL_DTYPE_INVALID; return CNNL_DTYPE_INVALID;
} }
} }
#endif// __COMMON_BANG_H__ template <typename T>
// (template <typename T> header is on the preceding line)
// Borrow a cnnlHandle_t from `pool`, bind it to `queue`, run `f(handle)`,
// then push the handle back for reuse. A handle is created lazily when the
// pool is empty. NOTE(review): unlike the older 4-argument variant this
// does not call cnrtSetDevice -- assumes the caller already selected the
// device; it also writes through *handle when pop() returned empty --
// confirm Pool<T>::pop()'s contract for that state.
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, cnrtQueue_t queue,
              T const &f) {
    auto handle = pool->pop();
    if (!handle) {
        // Pool exhausted: create a fresh cnnl handle in place.
        cnnlCreate(&(*handle));
    }
    // Run cnnl work on the caller-provided queue (stream).
    cnnlSetQueue(*handle, (cnrtQueue_t)queue);
    f(*handle);
    pool->push(std::move(*handle));
}
// Run `f` with a cnnlHandle_t borrowed from `pool`, without binding any
// queue. A handle is created lazily when the pool is empty, and is always
// returned to the pool afterwards for reuse by later callers.
template <typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, T const &f) {
    auto borrowed = pool->pop();
    if (!borrowed) {
        // Pool exhausted: create a fresh cnnl handle in place.
        cnnlCreate(&(*borrowed));
    }
    f(*borrowed);
    pool->push(std::move(*borrowed));
}
// Set a cnnl tensor descriptor from an infiniop tensor layout, using the
// shape only (no strides; cnnl's default contiguous layout is implied).
inline void setCnnlTensor(cnnlTensorDescriptor_t desc,
                          const infiniopTensorDescriptor_t layout) {
    // cnnl expects int dimensions; narrow each extent from the layout.
    std::vector<int> dims(layout->ndim);
    for (size_t i = 0; i < layout->ndim; i++) {
        dims[i] = static_cast<int>(layout->shape[i]);
    }
    cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY,
                            cnnlDataTypeConvert(layout->dtype), dims.size(),
                            dims.data());
}
// Set a cnnl tensor descriptor from an infiniop tensor layout, including
// explicit per-dimension strides (for non-contiguous views).
// NOTE(review): strides are forwarded verbatim from layout->strides --
// assumes they are expressed in elements, not bytes (the previous version
// of this helper divided by the dtype size); confirm against
// infiniopTensorDescriptor's definition.
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc,
                            const infiniopTensorDescriptor_t layout) {
    std::vector<int> dim_size(layout->ndim), dim_stride(layout->ndim);
    for (size_t i = 0; i < layout->ndim; i++) {
        dim_size[i] = static_cast<int>(layout->shape[i]);
        dim_stride[i] = static_cast<int>(layout->strides[i]);
    }
    cnnlSetTensorDescriptorEx(
        desc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(layout->dtype),
        dim_size.size(), dim_size.data(), dim_stride.data());
}
#endif // __COMMON_BANG_H__
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
#include "./cuda/cuda_handle.h" #include "./cuda/cuda_handle.h"
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_API
#include "./bang/bang_handle.h" #include "./bang/bang_handle.h"
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
...@@ -32,7 +32,7 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, infiniDe ...@@ -32,7 +32,7 @@ __C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, infiniDe
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
case DevCambriconMlu: { case INFINI_DEVICE_CAMBRICON: {
return createBangHandle((infiniopBangHandle_t *) handle_ptr, device_id); return createBangHandle((infiniopBangHandle_t *) handle_ptr, device_id);
} }
#endif #endif
...@@ -58,10 +58,9 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -58,10 +58,9 @@ __C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
return deleteCudaHandle((infiniopCudaHandle_t) handle); return deleteCudaHandle((infiniopCudaHandle_t) handle);
} }
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_API
case DevCambriconMlu: { case INFINI_DEVICE_CAMBRICON: {
delete (infiniopBangHandle_t) handle; return deleteBangHandle((infiniopBangHandle_t) handle);
return STATUS_SUCCESS;
} }
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
......
#include "matmul_cnnl.h" #include "matmul_cnnl.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h" #include "../../../devices/bang/common_bang.h"
#include "../../utils.h" #include "../../utils.h"
#include "cnrt.h" #include "matmul_cnnl_api.h"
infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
MatmulBangDescriptor_t *desc_ptr, infiniopStatus_t bangCreateMatmulDescriptor(
infiniopTensorDescriptor_t c_desc, infiniopBangHandle_t handle, infiniopMatmulBangDescriptor_t *desc_ptr,
float alpha, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
infiniopTensorDescriptor_t b_desc, infiniopStatus_t status;
float beta) { auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, false);
infiniopStatus_t *status = new infiniopStatus_t{STATUS_EXECUTION_FAILED}; if (status != INFINIOP_STATUS_SUCCESS) {
auto info = MatmulInfo(c_desc, a_desc, b_desc, status, false); return status;
if (*status != STATUS_SUCCESS) {
return *status;
} }
cnnlTensorDescriptor_t aDesc, bDesc, cDesc; cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
cnnlCreateTensorDescriptor(&aDesc); cnnlCreateTensorDescriptor(&aDesc);
cnnlCreateTensorDescriptor(&bDesc); cnnlCreateTensorDescriptor(&bDesc);
cnnlCreateTensorDescriptor(&cDesc); cnnlCreateTensorDescriptor(&cDesc);
setMatrixTensorEx(aDesc, info.a_matrix); setMatrixTensorEx(aDesc, info.a_matrix, a_desc->dtype);
setMatrixTensorEx(bDesc, info.b_matrix); setMatrixTensorEx(bDesc, info.b_matrix, b_desc->dtype);
setMatrixTensorEx(cDesc, info.c_matrix); setMatrixTensorEx(cDesc, info.c_matrix, c_desc->dtype);
cnnlMatMulDescriptor_t opDesc; cnnlMatMulDescriptor_t opDesc;
cnnlMatMulAlgo_t algo; cnnlMatMulAlgo_t algo;
...@@ -33,28 +30,37 @@ infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle, ...@@ -33,28 +30,37 @@ infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
int32_t use_stride = true; int32_t use_stride = true;
cnnlSetMatMulDescAttr(opDesc, CNNL_MATMUL_USE_STRIDE, &use_stride, cnnlSetMatMulDescAttr(opDesc, CNNL_MATMUL_USE_STRIDE, &use_stride,
sizeof(int32_t)); sizeof(int32_t));
*desc_ptr = new MatmulBangDescriptor{ int count = 0;
handle->device, use_cnnl(handle->cnnl_handles, [&](cnnlHandle_t _handle) {
handle->device_id, cnnlGetBatchMatMulAlgoHeuristic(_handle, opDesc, aDesc, bDesc, cDesc,
info, NULL, 1, &algoResult, &count);
alpha, });
beta,
c_desc->dt, size_t workspace_size;
handle->cnnl_handles, cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size);
aDesc, *desc_ptr = new InfiniopMatmulBangDescriptor{handle->device,
bDesc, handle->device_id,
cDesc, info,
opDesc, c_desc->dtype,
algo, handle->cnnl_handles,
algoResult}; aDesc,
return STATUS_SUCCESS; bDesc,
cDesc,
opDesc,
algo,
algoResult,
workspace_size};
return INFINIOP_STATUS_SUCCESS;
} }
infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size) { infiniopStatus_t bangGetMatmulWorkspaceSize(infiniopMatmulBangDescriptor_t desc,
*size = 0; size_t *size) {
return STATUS_SUCCESS; *size = desc->workspace_size;
return INFINIOP_STATUS_SUCCESS;
} }
infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) { infiniopStatus_t
bangDestroyMatmulDescriptor(infiniopMatmulBangDescriptor_t desc) {
desc->cnnl_handles = nullptr; desc->cnnl_handles = nullptr;
cnnlDestroyTensorDescriptor(desc->aDesc); cnnlDestroyTensorDescriptor(desc->aDesc);
cnnlDestroyTensorDescriptor(desc->bDesc); cnnlDestroyTensorDescriptor(desc->bDesc);
...@@ -63,41 +69,32 @@ infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) { ...@@ -63,41 +69,32 @@ infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) {
cnnlMatMulAlgoDestroy(desc->algo); cnnlMatMulAlgoDestroy(desc->algo);
cnnlDestroyMatMulHeuristicResult(desc->algoResult); cnnlDestroyMatMulHeuristicResult(desc->algoResult);
delete desc; delete desc;
return STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
} }
void matmul_cnnl_f16(MatmulBangDescriptor_t desc, void *workspace, void *c, float beta, void const *a, void const *b, float alpha, void *stream) { void matmul_cnnl(infiniopMatmulBangDescriptor_t desc, void *workspace, void *c,
float beta, void const *a, void const *b, float alpha,
void *stream) {
auto info = desc->info; auto info = desc->info;
if (info.is_transed) { if (info.is_transed) {
std::swap(a, b); std::swap(a, b);
} }
use_cnnl(desc->cnnl_handles, desc->device_id, (cnrtQueue_t) stream, use_cnnl(desc->cnnl_handles, (cnrtQueue_t)stream, [&](cnnlHandle_t handle) {
[&](cnnlHandle_t handle) { cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo, &alpha,
int count = 0; desc->aDesc, a, desc->bDesc, b, &beta,
cnnlGetBatchMatMulAlgoHeuristic(handle, desc->opDesc, desc->aDesc, desc->cDesc, c, workspace,
desc->bDesc, desc->cDesc, desc->workspace_size);
NULL, 1, &desc->algoResult, &count); });
size_t wsSize;
cnnlGetBatchMatMulHeuristicResult(desc->algoResult, desc->algo, &wsSize);
cnrtMalloc(&workspace, wsSize);
cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo,
&alpha, desc->aDesc, a,
desc->bDesc, b,
&beta, desc->cDesc, c,
workspace, wsSize);
});
} }
infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream) { infiniopStatus_t bangMatmul(infiniopMatmulBangDescriptor_t desc,
if (cnrtSetDevice(desc->device_id) != cnrtSuccess) { void *workspace, size_t workspace_size, void *c,
return STATUS_BAD_DEVICE; void const *a, void const *b, float alpha,
} float beta, void *stream) {
float alpha = desc->alpha; if (desc->dtype == INFINI_DTYPE_F16 || desc->dtype == INFINI_DTYPE_F32) {
float beta = desc->beta; matmul_cnnl(desc, workspace, c, beta, a, b, alpha, stream);
if (dtype_eq(desc->dtype, F16)) {
matmul_cnnl_f16(desc, workspace, c, beta, a, b, alpha, stream);
cnrtQueueSync((cnrtQueue_t)stream); cnrtQueueSync((cnrtQueue_t)stream);
return STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
} }
return STATUS_BAD_TENSOR_DTYPE; return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
} }
#ifndef __CNNL_MATMUL_H__ #ifndef __CNNL_MATMUL_H__
#define __CNNL_MATMUL_H__ #define __CNNL_MATMUL_H__
#include "../../../devices/bang/bang_handle.h" #include "../../../devices/bang/common_bang.h"
#include "../blas.h" #include "../blas.h"
#include "cnnl.h"
#include "cnnl_extra.h" #include "cnnl_extra.h"
#include "operators.h"
struct MatmulBangDescriptor { struct InfiniopMatmulBangDescriptor {
Device device; infiniDevice_t device;
int device_id; int device_id;
MatmulInfo info; MatmulInfo info;
float alpha; infiniDtype_t dtype;
float beta;
DT dtype;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles; std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
cnnlTensorDescriptor_t aDesc; cnnlTensorDescriptor_t aDesc;
cnnlTensorDescriptor_t bDesc; cnnlTensorDescriptor_t bDesc;
...@@ -20,24 +16,12 @@ struct MatmulBangDescriptor { ...@@ -20,24 +16,12 @@ struct MatmulBangDescriptor {
cnnlMatMulDescriptor_t opDesc; cnnlMatMulDescriptor_t opDesc;
cnnlMatMulAlgo_t algo; cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult; cnnlMatMulHeuristicResult_t algoResult;
size_t workspace_size;
}; };
typedef struct MatmulBangDescriptor *MatmulBangDescriptor_t;
infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle, inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc,
MatmulBangDescriptor_t *desc_ptr, const BlasMatrix &matrix, infiniDtype_t dtype,
infiniopTensorDescriptor_t c_desc, bool trans = false) {
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta);
infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size);
infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream);
infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc);
inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &matrix, bool trans = false) {
int ndim = matrix.ndim; int ndim = matrix.ndim;
int batch = matrix.batch; int batch = matrix.batch;
int stride = static_cast<int>(matrix.stride); int stride = static_cast<int>(matrix.stride);
...@@ -49,15 +33,16 @@ inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &mat ...@@ -49,15 +33,16 @@ inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &mat
if (ndim == 3) { if (ndim == 3) {
std::vector<int> dim_size = {batch, rows, cols}; std::vector<int> dim_size = {batch, rows, cols};
std::vector<int> dim_stride = {stride, row_stride, col_stride}; std::vector<int> dim_stride = {stride, row_stride, col_stride};
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF, cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY,
dim_size.size(), dim_size.data(), dim_stride.data()); cnnlDataTypeConvert(dtype), dim_size.size(),
dim_size.data(), dim_stride.data());
} else if (ndim == 2) { } else if (ndim == 2) {
std::vector<int> dim_size = {rows, cols}; std::vector<int> dim_size = {rows, cols};
std::vector<int> dim_stride = {row_stride, col_stride}; std::vector<int> dim_stride = {row_stride, col_stride};
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF, cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY,
dim_size.size(), dim_size.data(), dim_stride.data()); cnnlDataTypeConvert(dtype), dim_size.size(),
dim_size.data(), dim_stride.data());
} }
} }
#endif // __CNNL_MATMUL_H__
#endif// __CNNL_MATMUL_H__
#ifndef __CNNL_MATMUL_API_H__
#define __CNNL_MATMUL_API_H__
#include "../../../devices/bang/bang_handle.h"
#include "infiniop/operator.h"
// Opaque descriptor for the cnnl-backed matmul operator; the full struct
// definition lives in matmul_cnnl.h so this API header stays lightweight.
struct InfiniopMatmulBangDescriptor;
typedef struct InfiniopMatmulBangDescriptor *infiniopMatmulBangDescriptor_t;

// Builds a matmul descriptor for C = alpha*A*B + beta*C from the three
// tensor descriptors. alpha/beta are not captured here; they are supplied
// per call to bangMatmul.
infiniopStatus_t bangCreateMatmulDescriptor(
    infiniopBangHandle_t handle, infiniopMatmulBangDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc);

// Reports the workspace byte count bangMatmul needs for this descriptor.
infiniopStatus_t bangGetMatmulWorkspaceSize(infiniopMatmulBangDescriptor_t desc,
                                            size_t *size);

// Launches the matmul on `stream`; `workspace` must be at least as large as
// reported by bangGetMatmulWorkspaceSize.
infiniopStatus_t bangMatmul(infiniopMatmulBangDescriptor_t desc,
                            void *workspace, size_t workspace_size, void *c,
                            void const *a, void const *b, float alpha,
                            float beta, void *stream);

// Destroys the cnnl tensor/op descriptors, algo and heuristic result held by
// the descriptor, then frees the descriptor itself.
infiniopStatus_t
bangDestroyMatmulDescriptor(infiniopMatmulBangDescriptor_t desc);
#endif
...@@ -3,11 +3,10 @@ ...@@ -3,11 +3,10 @@
#include "../../utils.h" #include "../../utils.h"
#include <cmath> #include <cmath>
infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle, infiniopStatus_t cpuCreateMatmulDescriptor(
MatmulCpuDescriptor_t *desc_ptr, infiniopCpuHandle_t handle, infiniopMatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype; infiniDtype_t dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
...@@ -20,26 +19,27 @@ infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle, ...@@ -20,26 +19,27 @@ infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
return status; return status;
} }
*desc_ptr = new MatmulCpuDescriptor{ *desc_ptr = new MatmulCpuDescriptor{INFINI_DEVICE_CPU, dtype, info};
INFINI_DEVICE_CPU,
dtype,
info};
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
} }
infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size) { infiniopStatus_t cpuGetMatmulWorkspaceSize(infiniopMatmulCpuDescriptor_t desc,
uint64_t *size) {
*size = 0; *size = 0;
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
} }
infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc) { infiniopStatus_t
cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc) {
delete desc; delete desc;
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
} }
template<typename Tdata> template <typename Tdata>
infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha) { infiniopStatus_t matmul_cpu(infiniopMatmulCpuDescriptor_t desc, void *c,
float beta, void const *a, void const *b,
float alpha) {
auto info = desc->info; auto info = desc->info;
if (info.is_transed) { if (info.is_transed) {
...@@ -49,11 +49,20 @@ infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, voi ...@@ -49,11 +49,20 @@ infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, voi
for (int i = 0; i < info.batch; ++i) { for (int i = 0; i < info.batch; ++i) {
for (int m_ = 0; m_ < info.m; ++m_) { for (int m_ = 0; m_ < info.m; ++m_) {
for (int n_ = 0; n_ < info.n; ++n_) { for (int n_ = 0; n_ < info.n; ++n_) {
auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride; auto c_ = reinterpret_cast<Tdata *>(c) +
i * info.c_matrix.stride +
m_ * info.c_matrix.row_stride +
n_ * info.c_matrix.col_stride;
float sum = 0; float sum = 0;
for (int k_ = 0; k_ < info.k; ++k_) { for (int k_ = 0; k_ < info.k; ++k_) {
auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride; auto a_ = reinterpret_cast<Tdata const *>(a) +
auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride; i * info.a_matrix.stride +
m_ * info.a_matrix.row_stride +
k_ * info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) +
i * info.b_matrix.stride +
n_ * info.b_matrix.col_stride +
k_ * info.b_matrix.row_stride;
if constexpr (std::is_same<Tdata, uint16_t>::value) { if constexpr (std::is_same<Tdata, uint16_t>::value) {
sum += f16_to_f32(*a_) * f16_to_f32(*b_); sum += f16_to_f32(*a_) * f16_to_f32(*b_);
} else { } else {
...@@ -75,14 +84,9 @@ infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, voi ...@@ -75,14 +84,9 @@ infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, voi
return INFINIOP_STATUS_SUCCESS; return INFINIOP_STATUS_SUCCESS;
} }
infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc, infiniopStatus_t cpuMatmul(infiniopMatmulCpuDescriptor_t desc, void *workspace,
void *workspace, uint64_t workspace_size, void *c, void const *a,
uint64_t workspace_size, void const *b, float alpha, float beta) {
void *c,
void const *a,
void const *b,
float alpha,
float beta) {
if (desc->dtype == INFINI_DTYPE_F16) { if (desc->dtype == INFINI_DTYPE_F16) {
return matmul_cpu<uint16_t>(desc, c, beta, a, b, alpha); return matmul_cpu<uint16_t>(desc, c, beta, a, b, alpha);
} }
......
...@@ -11,25 +11,20 @@ typedef struct MatmulCpuDescriptor { ...@@ -11,25 +11,20 @@ typedef struct MatmulCpuDescriptor {
MatmulInfo info; MatmulInfo info;
} MatmulCpuDescriptor; } MatmulCpuDescriptor;
typedef struct MatmulCpuDescriptor *MatmulCpuDescriptor_t; typedef struct MatmulCpuDescriptor *infiniopMatmulCpuDescriptor_t;
infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle, infiniopStatus_t cpuCreateMatmulDescriptor(
MatmulCpuDescriptor_t *desc_ptr, infiniopCpuHandle_t handle, infiniopMatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc);
infiniopTensorDescriptor_t b_desc);
infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size); infiniopStatus_t cpuGetMatmulWorkspaceSize(infiniopMatmulCpuDescriptor_t desc,
uint64_t *size);
infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc, infiniopStatus_t cpuMatmul(infiniopMatmulCpuDescriptor_t desc, void *workspace,
void *workspace, uint64_t workspace_size, void *c, void const *a,
uint64_t workspace_size, void const *b, float alpha, float beta);
void *c,
void const *a,
void const *b,
float alpha,
float beta);
infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc); infiniopStatus_t cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc);
#endif// __INFINIOP_MATMUL_CPU_H__ #endif // __INFINIOP_MATMUL_CPU_H__
...@@ -7,39 +7,44 @@ ...@@ -7,39 +7,44 @@
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
#include "cuda/matmul_cuda_api.h" #include "cuda/matmul_cuda_api.h"
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_API
#include "bang/matmul_cnnl.h" #include "bang/matmul_cnnl_api.h"
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
#include "ascend/matmul_aclnn_api.h" #include "ascend/matmul_aclnn_api.h"
#endif #endif
__C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle, __C infiniopStatus_t infiniopCreateMatmulDescriptor(
infiniopMatmulDescriptor_t *desc_ptr, infiniopHandle_t handle, infiniopMatmulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc) {
infiniopTensorDescriptor_t b_desc) {
switch (handle->device) { switch (handle->device) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU: case INFINI_DEVICE_CPU:
return cpuCreateMatmulDescriptor((infiniopCpuHandle_t) handle, (MatmulCpuDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc); return cpuCreateMatmulDescriptor(
(infiniopCpuHandle_t)handle,
(infiniopMatmulCpuDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
#endif #endif
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: { case INFINI_DEVICE_NVIDIA: {
return cudaCreateMatmulDescriptor((infiniopCudaHandle_t) handle, (infiniopMatmulCudaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc); return cudaCreateMatmulDescriptor(
} (infiniopCudaHandle_t)handle,
(infiniopMatmulCudaDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
}
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_API
case DevCambriconMlu: { case INFINI_DEVICE_CAMBRICON: {
return bangCreateMatmulDescriptor((BangHandle_t) handle, (MatmulBangDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc); return bangCreateMatmulDescriptor(
} (infiniopBangHandle_t)handle,
(infiniopMatmulBangDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc);
}
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: { case INFINI_DEVICE_ASCEND: {
return aclnnCreateMatmulDescriptor( return aclnnCreateMatmulDescriptor((infiniopAscendHandle_t)handle,
(infiniopAscendHandle_t)handle, (MatmulAclnnDescriptor_t *)desc_ptr,
(MatmulAclnnDescriptor_t *)desc_ptr, c_desc, a_desc, b_desc, 1); c_desc, a_desc, b_desc, 1);
} }
#endif #endif
} }
return INFINIOP_STATUS_BAD_DEVICE; return INFINIOP_STATUS_BAD_DEVICE;
...@@ -49,25 +54,27 @@ __C infiniopStatus_t ...@@ -49,25 +54,27 @@ __C infiniopStatus_t
infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, size_t *size) { infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, size_t *size) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU: case INFINI_DEVICE_CPU:
return cpuGetMatmulWorkspaceSize((MatmulCpuDescriptor_t) desc, size); return cpuGetMatmulWorkspaceSize((infiniopMatmulCpuDescriptor_t)desc,
size);
#endif #endif
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: { case INFINI_DEVICE_NVIDIA: {
return cudaGetMatmulWorkspaceSize((infiniopMatmulCudaDescriptor_t) desc, size); return cudaGetMatmulWorkspaceSize((infiniopMatmulCudaDescriptor_t)desc,
} size);
}
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_API
case DevCambriconMlu: { case INFINI_DEVICE_CAMBRICON: {
return bangGetMatmulWorkspaceSize((MatmulBangDescriptor_t) desc, size); return bangGetMatmulWorkspaceSize((infiniopMatmulBangDescriptor_t)desc,
} size);
}
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: { case INFINI_DEVICE_ASCEND: {
return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc, return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t)desc, size);
size); }
}
#endif #endif
} }
return INFINIOP_STATUS_BAD_DEVICE; return INFINIOP_STATUS_BAD_DEVICE;
...@@ -79,49 +86,53 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, ...@@ -79,49 +86,53 @@ __C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc,
float alpha, float beta, void *stream) { float alpha, float beta, void *stream) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU: case INFINI_DEVICE_CPU:
return cpuMatmul((MatmulCpuDescriptor_t) desc, workspace, workspace_size, c, a, b, alpha, beta); return cpuMatmul((infiniopMatmulCpuDescriptor_t)desc, workspace,
workspace_size, c, a, b, alpha, beta);
#endif #endif
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: case INFINI_DEVICE_NVIDIA:
return cudaMatmul((infiniopMatmulCudaDescriptor_t) desc, workspace, workspace_size, c, a, b, alpha, beta, stream); return cudaMatmul((infiniopMatmulCudaDescriptor_t)desc, workspace,
#endif workspace_size, c, a, b, alpha, beta, stream);
#ifdef ENABLE_CAMBRICON_MLU #endif
case DevCambriconMlu: { #ifdef ENABLE_CAMBRICON_API
return bangMatmul((MatmulBangDescriptor_t)desc, workspace, case INFINI_DEVICE_CAMBRICON: {
workspace_size, c, a, b, alpha, beta, stream); return bangMatmul((infiniopMatmulBangDescriptor_t)desc, workspace,
} workspace_size, c, a, b, alpha, beta, stream);
}
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: case INFINI_DEVICE_ASCEND:
return aclnnMatmul((MatmulAclnnDescriptor_t)desc, workspace, return aclnnMatmul((MatmulAclnnDescriptor_t)desc, workspace,
workspace_size, c, a, b, alpha, beta, stream); workspace_size, c, a, b, alpha, beta, stream);
#endif #endif
} }
return INFINIOP_STATUS_BAD_DEVICE; return INFINIOP_STATUS_BAD_DEVICE;
} }
__C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) { __C infiniopStatus_t
infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
switch (desc->device) { switch (desc->device) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU: case INFINI_DEVICE_CPU:
return cpuDestroyMatmulDescriptor((MatmulCpuDescriptor_t) desc); return cpuDestroyMatmulDescriptor((infiniopMatmulCpuDescriptor_t)desc);
#endif #endif
#ifdef ENABLE_CUDA_API #ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: { case INFINI_DEVICE_NVIDIA: {
return cudaDestroyMatmulDescriptor((infiniopMatmulCudaDescriptor_t) desc); return cudaDestroyMatmulDescriptor(
} (infiniopMatmulCudaDescriptor_t)desc);
}
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_API
case DevCambriconMlu: { case INFINI_DEVICE_CAMBRICON: {
return bangDestroyMatmulDescriptor((MatmulBangDescriptor_t) desc); return bangDestroyMatmulDescriptor((infiniopMatmulBangDescriptor_t)desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: { case INFINI_DEVICE_ASCEND: {
return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc); return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t)desc);
} }
#endif #endif
} }
return INFINIOP_STATUS_BAD_DEVICE; return INFINIOP_STATUS_BAD_DEVICE;
......
...@@ -50,6 +50,7 @@ option_end() ...@@ -50,6 +50,7 @@ option_end()
if has_config("cambricon-mlu") then if has_config("cambricon-mlu") then
add_defines("ENABLE_CAMBRICON_API") add_defines("ENABLE_CAMBRICON_API")
includes("xmake/bang.lua")
end end
-- 华为昇腾 -- 华为昇腾
...@@ -124,7 +125,7 @@ target("infiniop") ...@@ -124,7 +125,7 @@ target("infiniop")
end end
if has_config("cambricon-mlu") then if has_config("cambricon-mlu") then
add_deps("cambricon-mlu") add_deps("infiniop-cambricon")
end end
if has_config("ascend-npu") then if has_config("ascend-npu") then
add_deps("infiniop-ascend") add_deps("infiniop-ascend")
...@@ -143,5 +144,4 @@ target("infiniop") ...@@ -143,5 +144,4 @@ target("infiniop")
add_installfiles("include/infiniop/*.h", {prefixdir = "include/infiniop"}) add_installfiles("include/infiniop/*.h", {prefixdir = "include/infiniop"})
add_installfiles("include/infiniop.h", {prefixdir = "include"}) add_installfiles("include/infiniop.h", {prefixdir = "include"})
add_installfiles("include/infinicore.h", {prefixdir = "include"}) add_installfiles("include/infinicore.h", {prefixdir = "include"})
target_end() target_end()
-- Cambricon Neuware toolkit location; falls back to the default install path.
local NEUWARE_HOME = os.getenv("NEUWARE_HOME") or "/usr/local/neuware"
add_includedirs(path.join(NEUWARE_HOME, "include"))
-- Both lib layouts are seen in the wild; register both link directories.
add_linkdirs(path.join(NEUWARE_HOME, "lib64"))
add_linkdirs(path.join(NEUWARE_HOME, "lib"))
-- NOTE(review): xmake's add_links conventionally takes bare library names
-- ("cnrt"), not full file names ("libcnrt.so") -- confirm these resolve
-- correctly with the target linker.
add_links("libcnrt.so")
add_links("libcnnl.so")
add_links("libcnnl_extra.so")
add_links("libcnpapi.so")
-- Build rule for Cambricon BANG kernel sources (*.mlu), compiled with cncc.
-- Fix: dropped the unused local `includedirs` (a table.concat result that was
-- never read; the loop below builds the -I arguments directly).
rule("mlu")
    set_extensions(".mlu")
    on_load(function (target)
        -- Make the project's public headers visible to every .mlu file.
        target:add("includedirs", path.join(os.projectdir(), "include"))
    end)
    on_build_file(function (target, sourcefile)
        local objectfile = target:objectfile(sourcefile)
        os.mkdir(path.directory(objectfile))
        -- cncc: Cambricon's BANG C++ compiler driver.
        local cc = "cncc"
        local args = {"-c", sourcefile, "-o", objectfile, "--bang-mlu-arch=mtp_592", "-O3", "-fPIC", "-Wall", "-Werror", "-std=c++17", "-pthread"}
        -- Forward every include directory registered on the target.
        for _, includedir in ipairs(target:get("includedirs")) do
            table.insert(args, "-I" .. includedir)
        end
        os.execv(cc, args)
        -- Register the produced object so it is linked into the target.
        table.insert(target:objectfiles(), objectfile)
    end)
rule_end()
local src_dir = path.join(os.projectdir(), "src", "infiniop")
-- Static library bundling all Cambricon (BANG/cnnl) device code.
target("infiniop-cambricon")
    set_kind("static")
    on_install(function (target) end) -- nothing extra to install for this lib
    set_languages("cxx17")
    add_files(src_dir.."/devices/bang/*.cc", src_dir.."/ops/*/bang/*.cc")
    -- Attach the custom "mlu" rule only when kernel files actually exist,
    -- so the build does not fail on an empty glob.
    local mlu_files = os.files(src_dir .. "/ops/*/bang/*.mlu")
    if #mlu_files > 0 then
        add_files(mlu_files, {rule = "mlu"})
    end
    -- NOTE(review): "-lstdc++" is a linker flag; passing it via add_cxflags
    -- is likely a compile-time no-op -- confirm whether it belongs in
    -- add_ldflags/add_syslinks instead.
    add_cxflags("-lstdc++ -Wall -Werror -fPIC")
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment