Unverified Commit fd0242ed authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #101 from PanZezhong1725/issue/89/bang

Issue/89/bang Refactor Handle, Runtime, and Matmul Implementation for Bang
parents 92ad2426 39b09a9e
#include "../pool.h"
#include "../../tensor.h"
#include "common_bang.h"
#include "infiniop/tensor_descriptor.h"
#include <memory>
#include <vector>
infiniStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr) {
int device_id = 0;
if (cnrtGetDevice(&device_id) != cnrtSuccess) {
return INFINI_STATUS_DEVICE_NOT_INITIALIZED;
namespace device::bang {
// Construct a BANG handle for `device_id`, eagerly allocating the shared
// Internal state (which owns the pooled cnnlHandle_t objects).
Handle::Handle(infiniDevice_t device, int device_id)
: InfiniopHandle{device, device_id},
_internal(std::make_shared<Handle::Internal>()) {}
// Accessor for the shared Internal state; returned by const reference so
// operators (e.g. matmul) can copy the shared_ptr and outlive this handle.
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal;
}
/// Run `f` with a cnnlHandle_t borrowed from the pool, bound to `queue`.
/// Creates a fresh cnnl handle lazily when the pool is empty, and returns
/// the handle to the pool afterwards (even if `f` fails).
/// @param queue cnrt queue the cnnl handle is bound to for this call
/// @param f     callback receiving the ready-to-use cnnl handle
/// @return INFINI_STATUS_SUCCESS, or the first failing status
infiniStatus_t Handle::Internal::useCnnl(cnrtQueue_t queue, const Fn<cnnlHandle_t> &f) const {
    auto handle = cnnl_handles.pop();
    if (!handle) {
        // Fix: the original dereferenced the empty optional (`&(*handle)`),
        // which is undefined behavior. Create into a local first.
        cnnlHandle_t raw;
        CHECK_BANG(cnnlCreate(&raw));
        handle = raw;
    }
    CHECK_BANG(cnnlSetQueue(*handle, queue));
    // Capture f's status instead of returning early, so the handle is
    // always pushed back into the pool rather than leaked on failure.
    auto status = f(*handle);
    cnnl_handles.push(std::move(*handle));
    CHECK_STATUS(status);
    return INFINI_STATUS_SUCCESS;
}
auto pool = std::make_shared<Pool<cnnlHandle_t>>();
cnnlHandle_t handle;
cnnlCreate(&handle);
pool->push(std::move(handle));
// Translate an infini dtype into the equivalent CNNL dtype.
// Any dtype without a CNNL counterpart maps to CNNL_DTYPE_INVALID.
cnnlDataType_t getCnnlDtype(infiniDtype_t dt) {
    if (dt == INFINI_DTYPE_F32) {
        return CNNL_DTYPE_FLOAT;
    }
    if (dt == INFINI_DTYPE_F64) {
        return CNNL_DTYPE_DOUBLE;
    }
    if (dt == INFINI_DTYPE_F16) {
        return CNNL_DTYPE_HALF;
    }
    if (dt == INFINI_DTYPE_I8) {
        return CNNL_DTYPE_INT8;
    }
    if (dt == INFINI_DTYPE_I32) {
        return CNNL_DTYPE_INT32;
    }
    if (dt == INFINI_DTYPE_U8) {
        return CNNL_DTYPE_UINT8;
    }
    if (dt == INFINI_DTYPE_BF16) {
        return CNNL_DTYPE_BFLOAT16;
    }
    if (dt == INFINI_DTYPE_I64) {
        return CNNL_DTYPE_INT64;
    }
    return CNNL_DTYPE_INVALID;
}
*handle_ptr = new InfiniopBangHandle{INFINI_DEVICE_CAMBRICON, device_id,
std::move(pool)};
// Configure `desc` as a dense CNNL_LAYOUT_ARRAY tensor matching `layout`'s
// shape and dtype. Strides are ignored (see setCnnlTensorEx for strided).
infiniStatus_t setCnnlTensor(cnnlTensorDescriptor_t desc,
                             const InfiniopTensorDescriptor *layout) {
    const auto ndim = layout->ndim();
    std::vector<int> dims;
    dims.reserve(ndim);
    for (size_t i = 0; i < ndim; i++) {
        dims.push_back(static_cast<int>(layout->shape()[i]));
    }
    CHECK_BANG(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY,
                                       getCnnlDtype(layout->dtype()),
                                       dims.size(), dims.data()));
    return INFINI_STATUS_SUCCESS;
}
// Configure `desc` as a CNNL_LAYOUT_ARRAY tensor matching `layout`'s shape,
// dtype AND element strides (for non-contiguous tensors).
infiniStatus_t setCnnlTensorEx(cnnlTensorDescriptor_t desc,
                               const InfiniopTensorDescriptor *layout) {
    const auto ndim = layout->ndim();
    std::vector<int> sizes;
    std::vector<int> strides;
    sizes.reserve(ndim);
    strides.reserve(ndim);
    for (size_t i = 0; i < ndim; i++) {
        sizes.push_back(static_cast<int>(layout->shape()[i]));
        strides.push_back(static_cast<int>(layout->strides()[i]));
    }
    CHECK_BANG(cnnlSetTensorDescriptorEx(
        desc, CNNL_LAYOUT_ARRAY, getCnnlDtype(layout->dtype()),
        sizes.size(), sizes.data(), strides.data()));
    return INFINI_STATUS_SUCCESS;
}
infiniStatus_t destroyBangHandle(infiniopBangHandle_t handle) {
delete handle;
namespace cambricon {
// Cambricon-specific handle: a bang::Handle pinned to INFINI_DEVICE_CAMBRICON.
Handle::Handle(int device_id)
: bang::Handle(INFINI_DEVICE_CAMBRICON, device_id) {}
// Factory used by the device dispatch table (CREATE macro in handle.cc):
// allocates a Cambricon handle and hands ownership to the caller.
// NOTE(review): `new` throws on OOM rather than returning null, so no
// null-check is needed here.
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS;
}
} // namespace cambricon
} // namespace device::bang
#ifndef BANG_HANDLE_H
#define BANG_HANDLE_H
#ifndef __INFINIOP_BANG_HANDLE_H__
#define __INFINIOP_BANG_HANDLE_H__
#include "../../handle.h"
#include <memory>
struct InfiniopBangHandle;
typedef struct InfiniopBangHandle *infiniopBangHandle_t;
namespace device::bang {
infiniStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr);
infiniStatus_t destroyBangHandle(infiniopBangHandle_t handle);
struct Handle : public InfiniopHandle {
class Internal;
auto internal() const -> const std::shared_ptr<Internal> &;
#endif
protected:
Handle(infiniDevice_t device, int device_id);
private:
std::shared_ptr<Internal> _internal;
};
namespace cambricon {
// Concrete Cambricon handle. The constructor is private so all creation
// flows through create(), which fixes the device to INFINI_DEVICE_CAMBRICON.
class Handle : public bang::Handle {
Handle(int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace cambricon
} // namespace device::bang
#endif // __INFINIOP_BANG_HANDLE_H__
......@@ -6,89 +6,32 @@
#include "bang_handle.h"
#include "cnnl.h"
#include "cnrt.h"
#include "infiniop/tensor_descriptor.h"
#include <memory>
#include <vector>
#include <functional>
// NRAM size cap used by kernels; limited to 1024 * 256 bytes below (the hardware maximum is 1024 * 768)
#define NRAM_MAX_SIZE (1024 * 256)
#define CHECK_BANG(API) CHECK_INTERNAL(API, CNNL_STATUS_SUCCESS)
#define GDRAM_MAX_SIZE (1024 * 1024 * 1024)
namespace device::bang {
struct InfiniopBangHandle {
infiniDevice_t device;
int device_id;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool;
};
class Handle::Internal {
Pool<cnnlHandle_t> cnnl_handles;
inline cnnlDataType_t cnnlDataTypeConvert(infiniDtype_t dataType) {
switch (dataType) {
case INFINI_DTYPE_F32:
return CNNL_DTYPE_FLOAT;
case INFINI_DTYPE_F64:
return CNNL_DTYPE_DOUBLE;
case INFINI_DTYPE_F16:
return CNNL_DTYPE_HALF;
case INFINI_DTYPE_I8:
return CNNL_DTYPE_INT8;
case INFINI_DTYPE_I32:
return CNNL_DTYPE_INT32;
case INFINI_DTYPE_U8:
return CNNL_DTYPE_UINT8;
case INFINI_DTYPE_BF16:
return CNNL_DTYPE_BFLOAT16;
case INFINI_DTYPE_I64:
return CNNL_DTYPE_INT64;
default:
return CNNL_DTYPE_INVALID;
}
}
template <typename T>
using Fn = std::function<infiniStatus_t(T)>;
template <typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, cnrtQueue_t queue,
T const &f) {
auto handle = pool->pop();
if (!handle) {
cnnlCreate(&(*handle));
}
cnnlSetQueue(*handle, (cnrtQueue_t)queue);
f(*handle);
pool->push(std::move(*handle));
}
public:
infiniStatus_t useCnnl(cnrtQueue_t queue, const Fn<cnnlHandle_t> &f) const;
};
template <typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, T const &f) {
auto handle = pool->pop();
if (!handle) {
cnnlCreate(&(*handle));
}
f(*handle);
pool->push(std::move(*handle));
}
cnnlDataType_t getCnnlDtype(infiniDtype_t dt);
// set cnnl tensor descriptor without strides
inline void setCnnlTensor(cnnlTensorDescriptor_t desc,
const infiniopTensorDescriptor_t layout) {
std::vector<int> dims(layout->ndim);
for (size_t i = 0; i < layout->ndim; i++) {
dims[i] = static_cast<int>(layout->shape[i]);
}
cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(layout->dtype), dims.size(),
dims.data());
}
// set cnnl tensor descriptor without strides
infiniStatus_t setCnnlTensor(cnnlTensorDescriptor_t desc,
const InfiniopTensorDescriptor *layout);
// set cnnl tensor descriptor with strides
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc,
const infiniopTensorDescriptor_t layout) {
std::vector<int> dim_size(layout->ndim), dim_stride(layout->ndim);
for (size_t i = 0; i < layout->ndim; i++) {
dim_size[i] = static_cast<int>(layout->shape[i]);
dim_stride[i] = static_cast<int>(layout->strides[i]);
}
cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(layout->dtype),
dim_size.size(), dim_size.data(), dim_stride.data());
}
infiniStatus_t setCnnlTensorEx(cnnlTensorDescriptor_t desc,
const InfiniopTensorDescriptor *layout);
} // namespace device::bang
#endif // __COMMON_BANG_H__
......@@ -39,9 +39,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
return createBangHandle((infiniopBangHandle_t *)handle_ptr);
}
CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
#endif
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
......@@ -76,9 +74,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
return destroyBangHandle((infiniopBangHandle_t)handle);
}
DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
#endif
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
......
#include "matmul_bang.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h"
#include <cnnl_extra.h>
......@@ -10,7 +9,7 @@ struct Descriptor::Opaque {
cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult;
cnnlTensorDescriptor_t a, b, c;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool;
std::shared_ptr<device::bang::Handle::Internal> internal;
~Opaque() {
cnnlDestroyTensorDescriptor(a);
......@@ -22,7 +21,7 @@ struct Descriptor::Opaque {
}
};
static void setMatrixTensorEx(
static infiniStatus_t setMatrixTensorEx(
cnnlTensorDescriptor_t desc,
const BlasMatrix &matrix, infiniDtype_t dtype,
bool trans = false) {
......@@ -38,20 +37,21 @@ static void setMatrixTensorEx(
case 3: {
std::vector<int> dim_size = {batch, rows, cols};
std::vector<int> dim_stride = {stride, row_stride, col_stride};
cnnlSetTensorDescriptorEx(
CHECK_BANG(cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(dtype), dim_size.size(),
dim_size.data(), dim_stride.data());
device::bang::getCnnlDtype(dtype), dim_size.size(),
dim_size.data(), dim_stride.data()));
} break;
case 2: {
std::vector<int> dim_size = {rows, cols};
std::vector<int> dim_stride = {row_stride, col_stride};
cnnlSetTensorDescriptorEx(
CHECK_BANG(cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(dtype), dim_size.size(),
dim_size.data(), dim_stride.data());
device::bang::getCnnlDtype(dtype), dim_size.size(),
dim_size.data(), dim_stride.data()));
} break;
}
return INFINI_STATUS_SUCCESS;
}
Descriptor::~Descriptor() {
......@@ -64,8 +64,8 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<infiniopBangHandle_t>(handle_);
auto dtype = c_desc->dtype;
auto handle = reinterpret_cast<device::bang::cambricon::Handle *>(handle_);
auto dtype = c_desc->dtype();
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
......@@ -78,48 +78,47 @@ infiniStatus_t Descriptor::create(
}
cnnlTensorDescriptor_t a, b, c;
cnnlCreateTensorDescriptor(&a);
cnnlCreateTensorDescriptor(&b);
cnnlCreateTensorDescriptor(&c);
CHECK_BANG(cnnlCreateTensorDescriptor(&a));
CHECK_BANG(cnnlCreateTensorDescriptor(&b));
CHECK_BANG(cnnlCreateTensorDescriptor(&c));
setMatrixTensorEx(a, info.a_matrix, a_desc->dtype);
setMatrixTensorEx(b, info.b_matrix, b_desc->dtype);
setMatrixTensorEx(c, info.c_matrix, c_desc->dtype);
CHECK_STATUS(setMatrixTensorEx(a, info.a_matrix, a_desc->dtype()));
CHECK_STATUS(setMatrixTensorEx(b, info.b_matrix, b_desc->dtype()));
CHECK_STATUS(setMatrixTensorEx(c, info.c_matrix, c_desc->dtype()));
cnnlMatMulDescriptor_t op;
cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult;
cnnlMatMulDescCreate(&op);
cnnlMatMulAlgoCreate(&algo);
cnnlCreateMatMulHeuristicResult(&algoResult);
CHECK_BANG(cnnlMatMulDescCreate(&op));
CHECK_BANG(cnnlMatMulAlgoCreate(&algo));
CHECK_BANG(cnnlCreateMatMulHeuristicResult(&algoResult));
int32_t use_stride = true;
cnnlSetMatMulDescAttr(
CHECK_BANG(cnnlSetMatMulDescAttr(
op,
CNNL_MATMUL_USE_STRIDE,
&use_stride,
sizeof(int32_t));
sizeof(int32_t)));
int count = 0;
use_cnnl(handle->cnnl_handle_pool,
[&](cnnlHandle_t _handle) {
cnnlGetBatchMatMulAlgoHeuristic(
_handle,
op, a, b, c,
NULL, 1, &algoResult, &count);
});
CHECK_STATUS(
handle->internal()->useCnnl(
(cnrtQueue_t) nullptr,
[&](cnnlHandle_t _handle) {
CHECK_BANG(
cnnlGetBatchMatMulAlgoHeuristic(
_handle,
op, a, b, c,
NULL, 1, &algoResult, &count));
return INFINI_STATUS_SUCCESS;
}));
size_t workspace_size;
cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size);
CHECK_BANG(cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size));
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
new Opaque{
op,
algo,
algoResult,
a,
b,
c,
handle->cnnl_handle_pool},
op, algo, algoResult, a, b, c, handle->internal()},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
......@@ -137,21 +136,22 @@ infiniStatus_t Descriptor::calculate(
if (_info.is_transed) {
std::swap(a, b);
}
use_cnnl(_opaque->cnnl_handle_pool,
(cnrtQueue_t)stream,
[&](cnnlHandle_t handle) {
cnnlBatchMatMulBCast_v2(
handle,
_opaque->op,
_opaque->algo,
&alpha,
_opaque->a, a,
_opaque->b, b,
&beta,
_opaque->c, c,
workspace,
workspace_size);
});
CHECK_STATUS(_opaque->internal->useCnnl(
(cnrtQueue_t)stream,
[&](cnnlHandle_t handle) {
CHECK_BANG(cnnlBatchMatMulBCast_v2(
handle,
_opaque->op,
_opaque->algo,
&alpha,
_opaque->a, a,
_opaque->b, b,
&beta,
_opaque->c, c,
workspace,
workspace_size));
return INFINI_STATUS_SUCCESS;
}));
cnrtQueueSync((cnrtQueue_t)stream);
return INFINI_STATUS_SUCCESS;
......
#include "infinirt_bang.h"
#include "../../utils.h"
#include "cnrt.h"
#define CHECK_BANGRT(RT_API) CHECK_INTERNAL(RT_API, cnrtSuccess)
namespace infinirt::bang {
// Report how many Cambricon devices are visible to cnrt.
infiniStatus_t getDeviceCount(int *count) {
CHECK_BANGRT(cnrtGetDeviceCount(count));
return INFINI_STATUS_SUCCESS;
}
// Select the active Cambricon device for the calling thread.
infiniStatus_t setDevice(int device_id) {
CHECK_BANGRT(cnrtSetDevice(device_id));
return INFINI_STATUS_SUCCESS;
}
// Block until all outstanding work on the current device has finished.
infiniStatus_t deviceSynchronize() {
CHECK_BANGRT(cnrtSyncDevice());
return INFINI_STATUS_SUCCESS;
}
// Create a cnrt queue and return it as the opaque infinirt stream handle.
// @param stream_ptr receives the newly created queue on success
infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {
    cnrtQueue_t queue;
    // Fix: the original passed `&stream`, a name that does not exist in this
    // scope — the queue must be created into the local `queue`.
    CHECK_BANGRT(cnrtQueueCreate(&queue));
    *stream_ptr = queue;
    return INFINI_STATUS_SUCCESS;
}
// Destroy the cnrt queue backing `stream`.
infiniStatus_t streamDestroy(infinirtStream_t stream) {
CHECK_BANGRT(cnrtQueueDestroy((cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}
// Block until all work previously enqueued on `stream` has completed.
infiniStatus_t streamSynchronize(infinirtStream_t stream) {
CHECK_BANGRT(cnrtQueueSync((cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}
// Make `stream` wait (on-device) until `event` (a cnrt notifier) fires.
infiniStatus_t streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
CHECK_BANGRT(cnrtQueueWaitNotifier((cnrtNotifier_t)event, (cnrtQueue_t)stream, 0));
return INFINI_STATUS_SUCCESS;
}
// Create a cnrt notifier and return it as the opaque infinirt event handle.
infiniStatus_t eventCreate(infinirtEvent_t *event_ptr) {
cnrtNotifier_t notifier;
CHECK_BANGRT(cnrtNotifierCreate(&notifier));
*event_ptr = notifier;
return INFINI_STATUS_SUCCESS;
}
// Record `event` into `stream`: the notifier fires when the stream reaches it.
infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
CHECK_BANGRT(cnrtPlaceNotifier((cnrtNotifier_t)event, (cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}
// Non-blocking completion probe for `event`.
// @param event      the notifier to query
// @param status_ptr receives COMPLETE or NOT_READY on success
// @return INFINI_STATUS_SUCCESS, or an error status on a real cnrt failure
infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) {
    // Fix: the original queried `(cnrtQueue_t)stream`, but `stream` is not a
    // parameter of this function — the notifier `event` must be queried.
    auto status = cnrtQueryNotifier((cnrtNotifier_t)event);
    if (status == cnrtSuccess) {
        *status_ptr = INFINIRT_EVENT_COMPLETE;
    } else if (status == cnrtErrorBusy) {
        // Busy means the notifier has not been reached yet — not an error.
        *status_ptr = INFINIRT_EVENT_NOT_READY;
    } else {
        CHECK_BANGRT(status);
    }
    return INFINI_STATUS_SUCCESS;
}
// Block the host until `event` (a cnrt notifier) has fired.
infiniStatus_t eventSynchronize(infinirtEvent_t event) {
CHECK_BANGRT(cnrtWaitNotifier((cnrtNotifier_t)event));
return INFINI_STATUS_SUCCESS;
}
// Destroy the cnrt notifier backing `event`.
infiniStatus_t eventDestroy(infinirtEvent_t event) {
CHECK_BANGRT(cnrtNotifierDestroy((cnrtNotifier_t)event));
return INFINI_STATUS_SUCCESS;
}
// Allocate `size` bytes of device (GDRAM) memory.
infiniStatus_t mallocDevice(void **p_ptr, size_t size) {
CHECK_BANGRT(cnrtMalloc(p_ptr, size));
return INFINI_STATUS_SUCCESS;
}
// Allocate `size` bytes of host memory via cnrt (pairs with freeHost).
infiniStatus_t mallocHost(void **p_ptr, size_t size) {
CHECK_BANGRT(cnrtHostMalloc(p_ptr, size));
return INFINI_STATUS_SUCCESS;
}
// Free device memory previously allocated with mallocDevice/mallocAsync.
infiniStatus_t freeDevice(void *ptr) {
CHECK_BANGRT(cnrtFree(ptr));
return INFINI_STATUS_SUCCESS;
}
// Free host memory previously allocated with mallocHost.
infiniStatus_t freeHost(void *ptr) {
CHECK_BANGRT(cnrtFreeHost(ptr));
return INFINI_STATUS_SUCCESS;
}
// Map an infinirt memcpy direction onto the cnrt transfer direction.
// Unknown kinds fall back to cnrtMemcpyNoDirection.
cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
    if (kind == INFINIRT_MEMCPY_H2D) {
        return cnrtMemcpyHostToDev;
    } else if (kind == INFINIRT_MEMCPY_D2H) {
        return cnrtMemcpyDevToHost;
    } else if (kind == INFINIRT_MEMCPY_D2D) {
        return cnrtMemcpyDevToDev;
    } else if (kind == INFINIRT_MEMCPY_H2H) {
        return cnrtMemcpyHostToHost;
    }
    return cnrtMemcpyNoDirection;
}
// Synchronous memcpy between host/device spaces, direction given by `kind`.
infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) {
CHECK_BANGRT(cnrtMemcpy(dst, src, size, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
// Asynchronous memcpy ordered on `stream`; completes when the queue drains.
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
// Does not support async malloc. Use blocking-style malloc instead
// (the `stream` argument is intentionally ignored).
infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMalloc(p_ptr, size));
return INFINI_STATUS_SUCCESS;
}
// Does not support async free. Use blocking-style free instead
// (the `stream` argument is intentionally ignored).
infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) {
CHECK_BANGRT(cnrtFree(ptr));
return INFINI_STATUS_SUCCESS;
}
} // namespace infinirt::bang
#ifndef __INFINIRT_BANG_H__
#define __INFINIRT_BANG_H__
#include "../infinirt_impl.h"
namespace infinirt::bang {
// Declare the device runtime API surface for Cambricon BANG. When the
// backend is compiled out, the NOOP variant stubs every entry point so
// callers still link.
#ifdef ENABLE_BANG_API
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
#endif
} // namespace infinirt::bang
#endif // __INFINIRT_BANG_H__
#include "infinirt.h"
#include "../utils.h"
#include "ascend/infinirt_ascend.h"
#include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh"
......@@ -51,6 +52,9 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_NVIDIA: \
_status = infinirt::cuda::API PARAMS; \
break; \
case INFINI_DEVICE_CAMBRICON: \
_status = infinirt::bang::API PARAMS; \
break; \
case INFINI_DEVICE_ASCEND: \
_status = infinirt::ascend::API PARAMS; \
break; \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment