Commit e9ce6db5 authored by Zimin Li's avatar Zimin Li
Browse files

issue/89: Refactor the handle and runtime (rt) for the Bang backend, and reflect the related changes in the Bang matmul implementation

parent 35ad7d1e
#ifndef __INFINIOP_BANG_INTERNAL_H__
#define __INFINIOP_BANG_INTERNAL_H__
#include "../pool.h"
#include "bang_handle.h"
#include "cnnl.h"
#include "cnrt.h"
#include <functional>
namespace device::bang {
// Internal state shared by all copies of a Bang Handle: a pool of reusable
// cnnlHandle_t objects so cnnl handle creation cost is amortized.
class Handle::Internal {
// Reusable cnnl handles; borrowed and returned by use_cnnl().
Pool<cnnlHandle_t> cnnl_handles;
public:
// Borrow a cnnl handle (lazily created when the pool is empty), bind it to
// `queue`, invoke `f` with it, then return the handle to the pool.
infiniStatus_t use_cnnl(cnrtQueue_t queue, const std::function<void(cnnlHandle_t)> &f) const;
};
// Map an infiniop dtype to the corresponding cnnl dtype
// (CNNL_DTYPE_INVALID for unsupported dtypes).
cnnlDataType_t getCnnlDtype(infiniDtype_t dt);
} // namespace device::bang
#endif // __INFINIOP_BANG_INTERNAL_H__
#include "../pool.h"
#include "common_bang.h"
#include "cnnl.h"
#include "infiniop/tensor_descriptor.h"
#include <memory>
#include <vector>
#include "../../tensor.h"
#include "_internal.h"
#include "common_bang.h"
namespace device::bang {
// Construct a Bang handle for (device, device_id) and allocate its shared
// internal state (the cnnl handle pool).
Handle::Handle(infiniDevice_t device, int device_id)
: InfiniopHandle{device, device_id},
_internal(std::make_shared<Handle::Internal>()) {}
// Accessor for the shared internal state (cnnl handle pool).
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal;
}
// Convenience alias for a single-argument void callback.
// NOTE(review): appears unused in the visible code — candidate for removal.
template <typename T>
using Fn = std::function<void(T)>;
infiniStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr) {
int device_id = 0;
if (cnrtGetDevice(&device_id) != cnrtSuccess) {
return INFINI_STATUS_DEVICE_NOT_INITIALIZED;
// Borrow a cnnl handle from the pool (creating one lazily when the pool is
// empty), bind it to `queue`, run `f` with it, then return the handle to the
// pool for reuse.
//
// Returns INFINI_STATUS_SUCCESS on success, or the CHECK_BANG-mapped error if
// handle creation or queue binding fails.
infiniStatus_t Handle::Internal::use_cnnl(cnrtQueue_t queue, const std::function<void(cnnlHandle_t)> &f) const {
    auto handle = cnnl_handles.pop();
    if (!handle) {
        // Fix: materialize the slot before taking its address (the original
        // dereferenced the empty pop() result), and check cnnlCreate's status
        // instead of silently proceeding with a possibly-invalid handle.
        handle = cnnlHandle_t{};
        CHECK_BANG(cnnlCreate(&(*handle)));
    }
    // Fix: the original returned early on a cnnlSetQueue failure and leaked
    // the handle. Bind the queue, run the callback only on success, and
    // always give the handle back to the pool before reporting the status.
    auto status = cnnlSetQueue(*handle, queue);
    if (status == CNNL_STATUS_SUCCESS) {
        f(*handle);
    }
    cnnl_handles.push(std::move(*handle));
    CHECK_BANG(status);
    return INFINI_STATUS_SUCCESS;
}
auto pool = std::make_shared<Pool<cnnlHandle_t>>();
cnnlHandle_t handle;
cnnlCreate(&handle);
pool->push(std::move(handle));
// Map an infiniop element type onto the equivalent cnnl element type.
// Unrecognized types yield CNNL_DTYPE_INVALID.
cnnlDataType_t getCnnlDtype(infiniDtype_t dt) {
    struct DtypePair {
        infiniDtype_t from;
        cnnlDataType_t to;
    };
    static const DtypePair kTable[] = {
        {INFINI_DTYPE_F32, CNNL_DTYPE_FLOAT},
        {INFINI_DTYPE_F64, CNNL_DTYPE_DOUBLE},
        {INFINI_DTYPE_F16, CNNL_DTYPE_HALF},
        {INFINI_DTYPE_I8, CNNL_DTYPE_INT8},
        {INFINI_DTYPE_I32, CNNL_DTYPE_INT32},
        {INFINI_DTYPE_U8, CNNL_DTYPE_UINT8},
        {INFINI_DTYPE_BF16, CNNL_DTYPE_BFLOAT16},
        {INFINI_DTYPE_I64, CNNL_DTYPE_INT64},
    };
    for (const auto &entry : kTable) {
        if (entry.from == dt) {
            return entry.to;
        }
    }
    return CNNL_DTYPE_INVALID;
}
*handle_ptr = new InfiniopBangHandle{INFINI_DEVICE_CAMBRICON, device_id,
std::move(pool)};
// Set a cnnl tensor descriptor from the layout's shape only (strides are
// ignored; cnnl assumes a dense CNNL_LAYOUT_ARRAY layout).
inline infiniStatus_t setCnnlTensor(cnnlTensorDescriptor_t desc,
                                    const InfiniopTensorDescriptor *layout) {
    const size_t rank = layout->ndim();
    std::vector<int> shape_i32;
    shape_i32.reserve(rank);
    for (size_t axis = 0; axis < rank; ++axis) {
        shape_i32.push_back(static_cast<int>(layout->shape()[axis]));
    }
    CHECK_BANG(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY,
                                       getCnnlDtype(layout->dtype()),
                                       shape_i32.size(), shape_i32.data()));
    return INFINI_STATUS_SUCCESS;
}
// Set a cnnl tensor descriptor from both the layout's shape and its explicit
// per-dimension strides.
inline infiniStatus_t setCnnlTensorEx(cnnlTensorDescriptor_t desc,
                                      const InfiniopTensorDescriptor *layout) {
    const size_t rank = layout->ndim();
    std::vector<int> shape_i32(rank);
    std::vector<int> stride_i32(rank);
    for (size_t axis = 0; axis < rank; ++axis) {
        shape_i32[axis] = static_cast<int>(layout->shape()[axis]);
        stride_i32[axis] = static_cast<int>(layout->strides()[axis]);
    }
    CHECK_BANG(cnnlSetTensorDescriptorEx(
        desc, CNNL_LAYOUT_ARRAY, getCnnlDtype(layout->dtype()),
        shape_i32.size(), shape_i32.data(), stride_i32.data()));
    return INFINI_STATUS_SUCCESS;
}
infiniStatus_t destroyBangHandle(infiniopBangHandle_t handle) {
delete handle;
namespace cambricon {
// Cambricon handle: a thin specialization of the generic bang handle pinned
// to the INFINI_DEVICE_CAMBRICON device type.
Handle::Handle(int device_id)
: bang::Handle(INFINI_DEVICE_CAMBRICON, device_id) {}
// Factory: allocate a cambricon Handle and hand ownership to the caller via
// *handle_ptr. Always succeeds (operator new throws on OOM).
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS;
}
} // namespace cambricon
} // namespace device::bang
#ifndef BANG_HANDLE_H
#define BANG_HANDLE_H
#ifndef __INFINIOP_BANG_HANDLE_H__
#define __INFINIOP_BANG_HANDLE_H__
#include "../../handle.h"
#include <memory>
struct InfiniopBangHandle;
typedef struct InfiniopBangHandle *infiniopBangHandle_t;
namespace device::bang {
infiniStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr);
infiniStatus_t destroyBangHandle(infiniopBangHandle_t handle);
struct Handle : public InfiniopHandle {
class Internal;
auto internal() const -> const std::shared_ptr<Internal> &;
#endif
protected:
Handle(infiniDevice_t device, int device_id);
private:
std::shared_ptr<Internal> _internal;
};
namespace cambricon {
// Concrete handle type for Cambricon devices. The constructor is private;
// instances are produced through the static create() factory.
class Handle : public bang::Handle {
Handle(int device_id);
public:
// Allocate a new cambricon Handle into *handle_ptr.
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace cambricon
} // namespace device::bang
#endif // __INFINIOP_BANG_HANDLE_H__
......@@ -2,93 +2,18 @@
#define __COMMON_BANG_H__
#include "../../../utils.h"
#include "../pool.h"
#include "bang_handle.h"
#include "cnnl.h"
#include "cnrt.h"
#include "infiniop/tensor_descriptor.h"
#include <memory>
#include <vector>
// cap on NRAM usage in bytes — NOTE(review): this comment previously claimed the maximum NRAM is 1024 * 768, which disagrees with the 1024 * 256 value defined below; confirm the intended limit
#define NRAM_MAX_SIZE (1024 * 256)
#define GDRAM_MAX_SIZE (1024 * 1024 * 1024)
struct InfiniopBangHandle {
infiniDevice_t device;
int device_id;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool;
#ifdef __cplusplus
extern "C" {
#endif
#define CHECK_BANG(API) CHECK_INTERNAL(API, CNNL_STATUS_SUCCESS)
#ifdef __cplusplus
};
// Convert an infiniop dtype to its cnnl counterpart; anything without a
// direct cnnl equivalent becomes CNNL_DTYPE_INVALID.
inline cnnlDataType_t cnnlDataTypeConvert(infiniDtype_t dataType) {
    if (dataType == INFINI_DTYPE_F32) {
        return CNNL_DTYPE_FLOAT;
    }
    if (dataType == INFINI_DTYPE_F64) {
        return CNNL_DTYPE_DOUBLE;
    }
    if (dataType == INFINI_DTYPE_F16) {
        return CNNL_DTYPE_HALF;
    }
    if (dataType == INFINI_DTYPE_I8) {
        return CNNL_DTYPE_INT8;
    }
    if (dataType == INFINI_DTYPE_I32) {
        return CNNL_DTYPE_INT32;
    }
    if (dataType == INFINI_DTYPE_U8) {
        return CNNL_DTYPE_UINT8;
    }
    if (dataType == INFINI_DTYPE_BF16) {
        return CNNL_DTYPE_BFLOAT16;
    }
    if (dataType == INFINI_DTYPE_I64) {
        return CNNL_DTYPE_INT64;
    }
    return CNNL_DTYPE_INVALID;
}
// Run `f` with a pooled cnnl handle bound to `queue`. A handle is created on
// demand when the pool is empty and is always returned to the pool afterwards.
template <typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, cnrtQueue_t queue,
              T const &f) {
    auto borrowed = pool->pop();
    if (!borrowed) {
        cnnlCreate(&(*borrowed));
    }
    cnnlSetQueue(*borrowed, queue);
    f(*borrowed);
    pool->push(std::move(*borrowed));
}
// Run `f` with a pooled cnnl handle without rebinding its queue; the handle
// is created on demand and returned to the pool afterwards.
template <typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, T const &f) {
    auto borrowed = pool->pop();
    if (!borrowed) {
        cnnlCreate(&(*borrowed));
    }
    f(*borrowed);
    pool->push(std::move(*borrowed));
}
// set cnnl tensor descriptor without strides (dense CNNL_LAYOUT_ARRAY layout)
// NOTE(review): the cnnlSetTensorDescriptor return status is ignored here.
inline void setCnnlTensor(cnnlTensorDescriptor_t desc,
const infiniopTensorDescriptor_t layout) {
// shape entries are narrowed from the descriptor's native width to int
std::vector<int> dims(layout->ndim);
for (size_t i = 0; i < layout->ndim; i++) {
dims[i] = static_cast<int>(layout->shape[i]);
}
cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(layout->dtype), dims.size(),
dims.data());
}
// set cnnl tensor descriptor with strides
// NOTE(review): the cnnlSetTensorDescriptorEx return status is ignored here.
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc,
const infiniopTensorDescriptor_t layout) {
// shape and stride entries are narrowed to int for the cnnl API
std::vector<int> dim_size(layout->ndim), dim_stride(layout->ndim);
for (size_t i = 0; i < layout->ndim; i++) {
dim_size[i] = static_cast<int>(layout->shape[i]);
dim_stride[i] = static_cast<int>(layout->strides[i]);
}
cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(layout->dtype),
dim_size.size(), dim_size.data(), dim_stride.data());
}
#endif
#endif // __COMMON_BANG_H__
......@@ -39,9 +39,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
return createBangHandle((infiniopBangHandle_t *)handle_ptr);
}
CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
#endif
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
......@@ -76,9 +74,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: {
return destroyBangHandle((infiniopBangHandle_t)handle);
}
DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
#endif
#ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: {
......
#include "matmul_bang.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h"
#include "../../../devices/bang/_internal.h"
#include <cnnl_extra.h>
namespace op::matmul::bang {
......@@ -10,7 +10,7 @@ struct Descriptor::Opaque {
cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult;
cnnlTensorDescriptor_t a, b, c;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool;
std::shared_ptr<device::bang::Handle::Internal> internal;
~Opaque() {
cnnlDestroyTensorDescriptor(a);
......@@ -40,7 +40,7 @@ static void setMatrixTensorEx(
std::vector<int> dim_stride = {stride, row_stride, col_stride};
cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(dtype), dim_size.size(),
device::bang::getCnnlDtype(dtype), dim_size.size(),
dim_size.data(), dim_stride.data());
} break;
case 2: {
......@@ -48,7 +48,7 @@ static void setMatrixTensorEx(
std::vector<int> dim_stride = {row_stride, col_stride};
cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(dtype), dim_size.size(),
device::bang::getCnnlDtype(dtype), dim_size.size(),
dim_size.data(), dim_stride.data());
} break;
}
......@@ -64,8 +64,8 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<infiniopBangHandle_t>(handle_);
auto dtype = c_desc->dtype;
auto handle = reinterpret_cast<device::bang::cambricon::Handle *>(handle_);
auto dtype = c_desc->dtype();
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
......@@ -82,9 +82,9 @@ infiniStatus_t Descriptor::create(
cnnlCreateTensorDescriptor(&b);
cnnlCreateTensorDescriptor(&c);
setMatrixTensorEx(a, info.a_matrix, a_desc->dtype);
setMatrixTensorEx(b, info.b_matrix, b_desc->dtype);
setMatrixTensorEx(c, info.c_matrix, c_desc->dtype);
setMatrixTensorEx(a, info.a_matrix, a_desc->dtype());
setMatrixTensorEx(b, info.b_matrix, b_desc->dtype());
setMatrixTensorEx(c, info.c_matrix, c_desc->dtype());
cnnlMatMulDescriptor_t op;
cnnlMatMulAlgo_t algo;
......@@ -99,7 +99,8 @@ infiniStatus_t Descriptor::create(
&use_stride,
sizeof(int32_t));
int count = 0;
use_cnnl(handle->cnnl_handle_pool,
handle->internal()->use_cnnl((cnrtQueue_t)nullptr,
[&](cnnlHandle_t _handle) {
cnnlGetBatchMatMulAlgoHeuristic(
_handle,
......@@ -113,13 +114,8 @@ infiniStatus_t Descriptor::create(
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
new Opaque{
op,
algo,
algoResult,
a,
b,
c,
handle->cnnl_handle_pool},
op, algo, algoResult, a, b, c, handle->internal()
},
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
......@@ -137,21 +133,21 @@ infiniStatus_t Descriptor::calculate(
if (_info.is_transed) {
std::swap(a, b);
}
use_cnnl(_opaque->cnnl_handle_pool,
(cnrtQueue_t)stream,
[&](cnnlHandle_t handle) {
cnnlBatchMatMulBCast_v2(
handle,
_opaque->op,
_opaque->algo,
&alpha,
_opaque->a, a,
_opaque->b, b,
&beta,
_opaque->c, c,
workspace,
workspace_size);
});
_opaque->internal->use_cnnl(
(cnrtQueue_t)stream,
[&](cnnlHandle_t handle) {
cnnlBatchMatMulBCast_v2(
handle,
_opaque->op,
_opaque->algo,
&alpha,
_opaque->a, a,
_opaque->b, b,
&beta,
_opaque->c, c,
workspace,
workspace_size);
});
cnrtQueueSync((cnrtQueue_t)stream);
return INFINI_STATUS_SUCCESS;
......
#include "../../utils.h"
#include "infinirt_bang.h"
#include "cnrt.h"
#define CHECK_BANGRT(RT_API) CHECK_INTERNAL(RT_API, cnrtSuccess)
namespace infinirt::bang {
// Number of Cambricon devices visible to the runtime.
infiniStatus_t getDeviceCount(int *count) {
CHECK_BANGRT(cnrtGetDeviceCount(count));
return INFINI_STATUS_SUCCESS;
}
// Make `device_id` the current device for the calling thread.
infiniStatus_t setDevice(int device_id) {
CHECK_BANGRT(cnrtSetDevice(device_id));
return INFINI_STATUS_SUCCESS;
}
// Block until all outstanding work on the current device has finished.
infiniStatus_t deviceSynchronize() {
CHECK_BANGRT(cnrtSyncDevice());
return INFINI_STATUS_SUCCESS;
}
// Create a new cnrt queue and return it to the caller as an infinirt stream.
infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {
    cnrtQueue_t queue;
    // Fix: the original passed `&stream`, a name that does not exist in this
    // scope (the parameter is `stream_ptr`); create into the local `queue`.
    CHECK_BANGRT(cnrtQueueCreate(&queue));
    *stream_ptr = queue;
    return INFINI_STATUS_SUCCESS;
}
// Destroy the cnrt queue backing `stream`.
infiniStatus_t streamDestroy(infinirtStream_t stream) {
CHECK_BANGRT(cnrtQueueDestroy((cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}
// Block until every task previously enqueued on `stream` has completed.
infiniStatus_t streamSynchronize(infinirtStream_t stream) {
CHECK_BANGRT(cnrtQueueSync((cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}
// Make `stream` wait (asynchronously, on-device) until `event` is reached.
infiniStatus_t streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
CHECK_BANGRT(cnrtQueueWaitNotifier((cnrtNotifier_t)event, (cnrtQueue_t)stream, 0));
return INFINI_STATUS_SUCCESS;
}
// Create a cnrt notifier and return it as an infinirt event.
infiniStatus_t eventCreate(infinirtEvent_t *event_ptr) {
cnrtNotifier_t notifier;
CHECK_BANGRT(cnrtNotifierCreate(&notifier));
*event_ptr = notifier;
return INFINI_STATUS_SUCCESS;
}
// Record `event` at the current position of `stream`.
infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
CHECK_BANGRT(cnrtPlaceNotifier((cnrtNotifier_t)event, (cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}
// Non-blocking query of an event's completion state. Writes
// INFINIRT_EVENT_COMPLETE or INFINIRT_EVENT_NOT_READY to *status_ptr, or
// propagates any other cnrt error via CHECK_BANGRT.
infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) {
    // Fix: the original queried `(cnrtQueue_t)stream` — `stream` is not a
    // parameter of this function and the cast type was wrong. Query the
    // notifier behind `event` instead.
    auto status = cnrtQueryNotifier((cnrtNotifier_t)event);
    if (status == cnrtSuccess) {
        *status_ptr = INFINIRT_EVENT_COMPLETE;
    } else if (status == cnrtErrorBusy) {
        // Busy means the notifier has not been reached yet, not a failure.
        *status_ptr = INFINIRT_EVENT_NOT_READY;
    } else {
        CHECK_BANGRT(status);
    }
    return INFINI_STATUS_SUCCESS;
}
// Block the calling host thread until `event` has been reached.
infiniStatus_t eventSynchronize(infinirtEvent_t event) {
CHECK_BANGRT(cnrtWaitNotifier((cnrtNotifier_t)event));
return INFINI_STATUS_SUCCESS;
}
// Destroy the cnrt notifier backing `event`.
infiniStatus_t eventDestroy(infinirtEvent_t event) {
CHECK_BANGRT(cnrtNotifierDestroy((cnrtNotifier_t)event));
return INFINI_STATUS_SUCCESS;
}
// Allocate `size` bytes of device (GDRAM) memory.
infiniStatus_t mallocDevice(void **p_ptr, size_t size) {
CHECK_BANGRT(cnrtMalloc(p_ptr, size));
return INFINI_STATUS_SUCCESS;
}
// Allocate `size` bytes of host memory registered with the cnrt runtime.
infiniStatus_t mallocHost(void **p_ptr, size_t size) {
CHECK_BANGRT(cnrtHostMalloc(p_ptr, size));
return INFINI_STATUS_SUCCESS;
}
// Free memory obtained from mallocDevice.
infiniStatus_t freeDevice(void *ptr) {
CHECK_BANGRT(cnrtFree(ptr));
return INFINI_STATUS_SUCCESS;
}
// Free memory obtained from mallocHost.
infiniStatus_t freeHost(void *ptr) {
CHECK_BANGRT(cnrtFreeHost(ptr));
return INFINI_STATUS_SUCCESS;
}
// Translate an infinirt memcpy direction into the cnrt transfer direction.
// Bang distinguishes two device-to-device flavors (cnrtMemcpyDevToDev within
// a single device, cnrtMemcpyPeerToPeer across devices); since the caller
// does not say which applies, D2D — like any unrecognized kind — maps to
// cnrtMemcpyNoDirection and the runtime infers the direction.
cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
    if (kind == INFINIRT_MEMCPY_H2D) {
        return cnrtMemcpyHostToDev;
    }
    if (kind == INFINIRT_MEMCPY_D2H) {
        return cnrtMemcpyDevToHost;
    }
    if (kind == INFINIRT_MEMCPY_H2H) {
        return cnrtMemcpyHostToHost;
    }
    // INFINIRT_MEMCPY_D2D and anything unrecognized.
    return cnrtMemcpyNoDirection;
}
// Synchronous copy of `size` bytes in the direction described by `kind`.
infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) {
CHECK_BANGRT(cnrtMemcpy(dst, src, size, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
// Asynchronous copy of `size` bytes, ordered on `stream`.
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
// Stream-ordered allocation is not supported by this backend yet.
infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
return INFINI_STATUS_NOT_IMPLEMENTED;
}
// Stream-ordered free is not supported by this backend yet.
infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) {
return INFINI_STATUS_NOT_IMPLEMENTED;
}
} // namespace infinirt::bang
#ifndef __INFINIRT_BANG_H__
#define __INFINIRT_BANG_H__
#include "../infinirt_impl.h"
namespace infinirt::bang {
// Declare the full device API for the Bang backend when it is compiled in;
// otherwise emit no-op stubs so the runtime dispatch table still links.
#ifdef ENABLE_BANG_API
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
#endif
} // namespace infinirt::bang
#endif // __INFINIRT_BANG_H__
#include "infinirt.h"
#include "../utils.h"
#include "ascend/infinirt_ascend.h"
#include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh"
......@@ -51,6 +52,9 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_NVIDIA: \
_status = infinirt::cuda::API PARAMS; \
break; \
case INFINI_DEVICE_CAMBRICON: \
_status = infinirt::bang::API PARAMS; \
break; \
case INFINI_DEVICE_ASCEND: \
_status = infinirt::ascend::API PARAMS; \
break; \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment