Unverified Commit fd0242ed authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #101 from PanZezhong1725/issue/89/bang

Issue/89/bang Refactor Handle, Runtime, and Matmul Implementation for Bang
parents 92ad2426 39b09a9e
#include "../pool.h" #include "../../tensor.h"
#include "common_bang.h" #include "common_bang.h"
#include "infiniop/tensor_descriptor.h"
#include <memory> #include <memory>
#include <vector>
infiniStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr) { namespace device::bang {
int device_id = 0;
if (cnrtGetDevice(&device_id) != cnrtSuccess) { Handle::Handle(infiniDevice_t device, int device_id)
return INFINI_STATUS_DEVICE_NOT_INITIALIZED; : InfiniopHandle{device, device_id},
_internal(std::make_shared<Handle::Internal>()) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal;
}
infiniStatus_t Handle::Internal::useCnnl(cnrtQueue_t queue, const Fn<cnnlHandle_t> &f) const {
auto handle = cnnl_handles.pop();
if (!handle) {
CHECK_BANG(cnnlCreate(&(*handle)));
} }
CHECK_BANG(cnnlSetQueue(*handle, queue));
CHECK_STATUS(f(*handle));
cnnl_handles.push(std::move(*handle));
return INFINI_STATUS_SUCCESS;
}
auto pool = std::make_shared<Pool<cnnlHandle_t>>(); cnnlDataType_t getCnnlDtype(infiniDtype_t dt) {
cnnlHandle_t handle; switch (dt) {
cnnlCreate(&handle); case INFINI_DTYPE_F32:
pool->push(std::move(handle)); return CNNL_DTYPE_FLOAT;
case INFINI_DTYPE_F64:
return CNNL_DTYPE_DOUBLE;
case INFINI_DTYPE_F16:
return CNNL_DTYPE_HALF;
case INFINI_DTYPE_I8:
return CNNL_DTYPE_INT8;
case INFINI_DTYPE_I32:
return CNNL_DTYPE_INT32;
case INFINI_DTYPE_U8:
return CNNL_DTYPE_UINT8;
case INFINI_DTYPE_BF16:
return CNNL_DTYPE_BFLOAT16;
case INFINI_DTYPE_I64:
return CNNL_DTYPE_INT64;
default:
return CNNL_DTYPE_INVALID;
}
}
*handle_ptr = new InfiniopBangHandle{INFINI_DEVICE_CAMBRICON, device_id, infiniStatus_t setCnnlTensor(cnnlTensorDescriptor_t desc,
std::move(pool)}; const InfiniopTensorDescriptor *layout) {
std::vector<int> dims(layout->ndim());
for (size_t i = 0; i < layout->ndim(); i++) {
dims[i] = static_cast<int>(layout->shape()[i]);
}
CHECK_BANG(cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY,
getCnnlDtype(layout->dtype()), dims.size(),
dims.data()));
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t setCnnlTensorEx(cnnlTensorDescriptor_t desc,
const InfiniopTensorDescriptor *layout) {
std::vector<int> dim_size(layout->ndim()), dim_stride(layout->ndim());
for (size_t i = 0; i < layout->ndim(); i++) {
dim_size[i] = static_cast<int>(layout->shape()[i]);
dim_stride[i] = static_cast<int>(layout->strides()[i]);
}
CHECK_BANG(cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY, getCnnlDtype(layout->dtype()),
dim_size.size(), dim_size.data(), dim_stride.data()));
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
infiniStatus_t destroyBangHandle(infiniopBangHandle_t handle) { namespace cambricon {
delete handle;
Handle::Handle(int device_id)
: bang::Handle(INFINI_DEVICE_CAMBRICON, device_id) {}
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
*handle_ptr = new Handle(device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace cambricon
} // namespace device::bang
#ifndef BANG_HANDLE_H #ifndef __INFINIOP_BANG_HANDLE_H__
#define BANG_HANDLE_H #define __INFINIOP_BANG_HANDLE_H__
#include "../../handle.h" #include "../../handle.h"
#include <memory>
struct InfiniopBangHandle; namespace device::bang {
typedef struct InfiniopBangHandle *infiniopBangHandle_t;
infiniStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr); struct Handle : public InfiniopHandle {
infiniStatus_t destroyBangHandle(infiniopBangHandle_t handle); class Internal;
auto internal() const -> const std::shared_ptr<Internal> &;
#endif protected:
Handle(infiniDevice_t device, int device_id);
private:
std::shared_ptr<Internal> _internal;
};
namespace cambricon {
class Handle : public bang::Handle {
Handle(int device_id);
public:
static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);
};
} // namespace cambricon
} // namespace device::bang
#endif // __INFINIOP_BANG_HANDLE_H__
...@@ -6,89 +6,32 @@ ...@@ -6,89 +6,32 @@
#include "bang_handle.h" #include "bang_handle.h"
#include "cnnl.h" #include "cnnl.h"
#include "cnrt.h" #include "cnrt.h"
#include "infiniop/tensor_descriptor.h" #include <functional>
#include <memory>
#include <vector>
// the maximum NRAM memory is 1024 * 768 #define CHECK_BANG(API) CHECK_INTERNAL(API, CNNL_STATUS_SUCCESS)
#define NRAM_MAX_SIZE (1024 * 256)
#define GDRAM_MAX_SIZE (1024 * 1024 * 1024) namespace device::bang {
struct InfiniopBangHandle { class Handle::Internal {
infiniDevice_t device; Pool<cnnlHandle_t> cnnl_handles;
int device_id;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool;
};
inline cnnlDataType_t cnnlDataTypeConvert(infiniDtype_t dataType) { template <typename T>
switch (dataType) { using Fn = std::function<infiniStatus_t(T)>;
case INFINI_DTYPE_F32:
return CNNL_DTYPE_FLOAT;
case INFINI_DTYPE_F64:
return CNNL_DTYPE_DOUBLE;
case INFINI_DTYPE_F16:
return CNNL_DTYPE_HALF;
case INFINI_DTYPE_I8:
return CNNL_DTYPE_INT8;
case INFINI_DTYPE_I32:
return CNNL_DTYPE_INT32;
case INFINI_DTYPE_U8:
return CNNL_DTYPE_UINT8;
case INFINI_DTYPE_BF16:
return CNNL_DTYPE_BFLOAT16;
case INFINI_DTYPE_I64:
return CNNL_DTYPE_INT64;
default:
return CNNL_DTYPE_INVALID;
}
}
template <typename T> public:
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, cnrtQueue_t queue, infiniStatus_t useCnnl(cnrtQueue_t queue, const Fn<cnnlHandle_t> &f) const;
T const &f) { };
auto handle = pool->pop();
if (!handle) {
cnnlCreate(&(*handle));
}
cnnlSetQueue(*handle, (cnrtQueue_t)queue);
f(*handle);
pool->push(std::move(*handle));
}
template <typename T> cnnlDataType_t getCnnlDtype(infiniDtype_t dt);
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, T const &f) {
auto handle = pool->pop();
if (!handle) {
cnnlCreate(&(*handle));
}
f(*handle);
pool->push(std::move(*handle));
}
// set cnnl tensor descriptor without strides11 // set cnnl tensor descriptor without strides
inline void setCnnlTensor(cnnlTensorDescriptor_t desc, infiniStatus_t setCnnlTensor(cnnlTensorDescriptor_t desc,
const infiniopTensorDescriptor_t layout) { const InfiniopTensorDescriptor *layout);
std::vector<int> dims(layout->ndim);
for (size_t i = 0; i < layout->ndim; i++) {
dims[i] = static_cast<int>(layout->shape[i]);
}
cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(layout->dtype), dims.size(),
dims.data());
}
// set cnnl tensor descriptor with strides // set cnnl tensor descriptor with strides
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, infiniStatus_t setCnnlTensorEx(cnnlTensorDescriptor_t desc,
const infiniopTensorDescriptor_t layout) { const InfiniopTensorDescriptor *layout);
std::vector<int> dim_size(layout->ndim), dim_stride(layout->ndim);
for (size_t i = 0; i < layout->ndim; i++) { } // namespace device::bang
dim_size[i] = static_cast<int>(layout->shape[i]);
dim_stride[i] = static_cast<int>(layout->strides[i]);
}
cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY, cnnlDataTypeConvert(layout->dtype),
dim_size.size(), dim_size.data(), dim_stride.data());
}
#endif // __COMMON_BANG_H__ #endif // __COMMON_BANG_H__
...@@ -39,9 +39,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -39,9 +39,7 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia); CREATE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: { CREATE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
return createBangHandle((infiniopBangHandle_t *)handle_ptr);
}
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: { case INFINI_DEVICE_ASCEND: {
...@@ -76,9 +74,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -76,9 +74,7 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia); DELETE(INFINI_DEVICE_NVIDIA, cuda::nvidia);
#endif #endif
#ifdef ENABLE_CAMBRICON_API #ifdef ENABLE_CAMBRICON_API
case INFINI_DEVICE_CAMBRICON: { DELETE(INFINI_DEVICE_CAMBRICON, bang::cambricon);
return destroyBangHandle((infiniopBangHandle_t)handle);
}
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
case INFINI_DEVICE_ASCEND: { case INFINI_DEVICE_ASCEND: {
......
#include "matmul_bang.h" #include "matmul_bang.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h" #include "../../../devices/bang/common_bang.h"
#include <cnnl_extra.h> #include <cnnl_extra.h>
...@@ -10,7 +9,7 @@ struct Descriptor::Opaque { ...@@ -10,7 +9,7 @@ struct Descriptor::Opaque {
cnnlMatMulAlgo_t algo; cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult; cnnlMatMulHeuristicResult_t algoResult;
cnnlTensorDescriptor_t a, b, c; cnnlTensorDescriptor_t a, b, c;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool; std::shared_ptr<device::bang::Handle::Internal> internal;
~Opaque() { ~Opaque() {
cnnlDestroyTensorDescriptor(a); cnnlDestroyTensorDescriptor(a);
...@@ -22,7 +21,7 @@ struct Descriptor::Opaque { ...@@ -22,7 +21,7 @@ struct Descriptor::Opaque {
} }
}; };
static void setMatrixTensorEx( static infiniStatus_t setMatrixTensorEx(
cnnlTensorDescriptor_t desc, cnnlTensorDescriptor_t desc,
const BlasMatrix &matrix, infiniDtype_t dtype, const BlasMatrix &matrix, infiniDtype_t dtype,
bool trans = false) { bool trans = false) {
...@@ -38,20 +37,21 @@ static void setMatrixTensorEx( ...@@ -38,20 +37,21 @@ static void setMatrixTensorEx(
case 3: { case 3: {
std::vector<int> dim_size = {batch, rows, cols}; std::vector<int> dim_size = {batch, rows, cols};
std::vector<int> dim_stride = {stride, row_stride, col_stride}; std::vector<int> dim_stride = {stride, row_stride, col_stride};
cnnlSetTensorDescriptorEx( CHECK_BANG(cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY, desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(dtype), dim_size.size(), device::bang::getCnnlDtype(dtype), dim_size.size(),
dim_size.data(), dim_stride.data()); dim_size.data(), dim_stride.data()));
} break; } break;
case 2: { case 2: {
std::vector<int> dim_size = {rows, cols}; std::vector<int> dim_size = {rows, cols};
std::vector<int> dim_stride = {row_stride, col_stride}; std::vector<int> dim_stride = {row_stride, col_stride};
cnnlSetTensorDescriptorEx( CHECK_BANG(cnnlSetTensorDescriptorEx(
desc, CNNL_LAYOUT_ARRAY, desc, CNNL_LAYOUT_ARRAY,
cnnlDataTypeConvert(dtype), dim_size.size(), device::bang::getCnnlDtype(dtype), dim_size.size(),
dim_size.data(), dim_stride.data()); dim_size.data(), dim_stride.data()));
} break; } break;
} }
return INFINI_STATUS_SUCCESS;
} }
Descriptor::~Descriptor() { Descriptor::~Descriptor() {
...@@ -64,8 +64,8 @@ infiniStatus_t Descriptor::create( ...@@ -64,8 +64,8 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) { infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<infiniopBangHandle_t>(handle_); auto handle = reinterpret_cast<device::bang::cambricon::Handle *>(handle_);
auto dtype = c_desc->dtype; auto dtype = c_desc->dtype();
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
...@@ -78,48 +78,47 @@ infiniStatus_t Descriptor::create( ...@@ -78,48 +78,47 @@ infiniStatus_t Descriptor::create(
} }
cnnlTensorDescriptor_t a, b, c; cnnlTensorDescriptor_t a, b, c;
cnnlCreateTensorDescriptor(&a); CHECK_BANG(cnnlCreateTensorDescriptor(&a));
cnnlCreateTensorDescriptor(&b); CHECK_BANG(cnnlCreateTensorDescriptor(&b));
cnnlCreateTensorDescriptor(&c); CHECK_BANG(cnnlCreateTensorDescriptor(&c));
setMatrixTensorEx(a, info.a_matrix, a_desc->dtype); CHECK_STATUS(setMatrixTensorEx(a, info.a_matrix, a_desc->dtype()));
setMatrixTensorEx(b, info.b_matrix, b_desc->dtype); CHECK_STATUS(setMatrixTensorEx(b, info.b_matrix, b_desc->dtype()));
setMatrixTensorEx(c, info.c_matrix, c_desc->dtype); CHECK_STATUS(setMatrixTensorEx(c, info.c_matrix, c_desc->dtype()));
cnnlMatMulDescriptor_t op; cnnlMatMulDescriptor_t op;
cnnlMatMulAlgo_t algo; cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult; cnnlMatMulHeuristicResult_t algoResult;
cnnlMatMulDescCreate(&op); CHECK_BANG(cnnlMatMulDescCreate(&op));
cnnlMatMulAlgoCreate(&algo); CHECK_BANG(cnnlMatMulAlgoCreate(&algo));
cnnlCreateMatMulHeuristicResult(&algoResult); CHECK_BANG(cnnlCreateMatMulHeuristicResult(&algoResult));
int32_t use_stride = true; int32_t use_stride = true;
cnnlSetMatMulDescAttr( CHECK_BANG(cnnlSetMatMulDescAttr(
op, op,
CNNL_MATMUL_USE_STRIDE, CNNL_MATMUL_USE_STRIDE,
&use_stride, &use_stride,
sizeof(int32_t)); sizeof(int32_t)));
int count = 0; int count = 0;
use_cnnl(handle->cnnl_handle_pool,
[&](cnnlHandle_t _handle) { CHECK_STATUS(
cnnlGetBatchMatMulAlgoHeuristic( handle->internal()->useCnnl(
_handle, (cnrtQueue_t) nullptr,
op, a, b, c, [&](cnnlHandle_t _handle) {
NULL, 1, &algoResult, &count); CHECK_BANG(
}); cnnlGetBatchMatMulAlgoHeuristic(
_handle,
op, a, b, c,
NULL, 1, &algoResult, &count));
return INFINI_STATUS_SUCCESS;
}));
size_t workspace_size; size_t workspace_size;
cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size); CHECK_BANG(cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size));
*desc_ptr = new Descriptor( *desc_ptr = new Descriptor(
dtype, info, workspace_size, dtype, info, workspace_size,
new Opaque{ new Opaque{
op, op, algo, algoResult, a, b, c, handle->internal()},
algo,
algoResult,
a,
b,
c,
handle->cnnl_handle_pool},
handle->device, handle->device_id); handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -137,21 +136,22 @@ infiniStatus_t Descriptor::calculate( ...@@ -137,21 +136,22 @@ infiniStatus_t Descriptor::calculate(
if (_info.is_transed) { if (_info.is_transed) {
std::swap(a, b); std::swap(a, b);
} }
use_cnnl(_opaque->cnnl_handle_pool, CHECK_STATUS(_opaque->internal->useCnnl(
(cnrtQueue_t)stream, (cnrtQueue_t)stream,
[&](cnnlHandle_t handle) { [&](cnnlHandle_t handle) {
cnnlBatchMatMulBCast_v2( CHECK_BANG(cnnlBatchMatMulBCast_v2(
handle, handle,
_opaque->op, _opaque->op,
_opaque->algo, _opaque->algo,
&alpha, &alpha,
_opaque->a, a, _opaque->a, a,
_opaque->b, b, _opaque->b, b,
&beta, &beta,
_opaque->c, c, _opaque->c, c,
workspace, workspace,
workspace_size); workspace_size));
}); return INFINI_STATUS_SUCCESS;
}));
cnrtQueueSync((cnrtQueue_t)stream); cnrtQueueSync((cnrtQueue_t)stream);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
......
#include "infinirt_bang.h"
#include "../../utils.h"
#include "cnrt.h"
#define CHECK_BANGRT(RT_API) CHECK_INTERNAL(RT_API, cnrtSuccess)
namespace infinirt::bang {
// Report the number of Cambricon MLU devices visible to the runtime.
infiniStatus_t getDeviceCount(int *count) {
CHECK_BANGRT(cnrtGetDeviceCount(count));
return INFINI_STATUS_SUCCESS;
}
// Bind the calling thread to the MLU device with index `device_id`.
infiniStatus_t setDevice(int device_id) {
CHECK_BANGRT(cnrtSetDevice(device_id));
return INFINI_STATUS_SUCCESS;
}
// Block until all outstanding work on the current device has finished.
infiniStatus_t deviceSynchronize() {
CHECK_BANGRT(cnrtSyncDevice());
return INFINI_STATUS_SUCCESS;
}
// Create a new execution stream (backed by a cnrt queue) and return it
// through `stream_ptr`. On any cnrt failure the error is propagated by
// CHECK_BANGRT and `*stream_ptr` is left untouched.
infiniStatus_t streamCreate(infinirtStream_t *stream_ptr) {
    cnrtQueue_t queue;
    // Fix: the original called cnrtQueueCreate(&stream), taking the address
    // of the out-parameter (an infinirtStream_t*), not of the local queue.
    // The queue handle must be created into `queue` and then published.
    CHECK_BANGRT(cnrtQueueCreate(&queue));
    *stream_ptr = queue;
    return INFINI_STATUS_SUCCESS;
}
// Destroy a stream previously created by streamCreate.
infiniStatus_t streamDestroy(infinirtStream_t stream) {
CHECK_BANGRT(cnrtQueueDestroy((cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}
// Block until every operation queued on `stream` has completed.
infiniStatus_t streamSynchronize(infinirtStream_t stream) {
CHECK_BANGRT(cnrtQueueSync((cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}
// Make `stream` wait until `event` (a cnrt notifier) has been reached.
// The trailing 0 is the flags argument of cnrtQueueWaitNotifier.
infiniStatus_t streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) {
CHECK_BANGRT(cnrtQueueWaitNotifier((cnrtNotifier_t)event, (cnrtQueue_t)stream, 0));
return INFINI_STATUS_SUCCESS;
}
// Create an event (backed by a cnrt notifier) and return it via `event_ptr`.
// On failure the error is propagated and `*event_ptr` is left untouched.
infiniStatus_t eventCreate(infinirtEvent_t *event_ptr) {
cnrtNotifier_t notifier;
CHECK_BANGRT(cnrtNotifierCreate(&notifier));
*event_ptr = notifier;
return INFINI_STATUS_SUCCESS;
}
// Record `event` at the current position in `stream`'s work queue.
infiniStatus_t eventRecord(infinirtEvent_t event, infinirtStream_t stream) {
CHECK_BANGRT(cnrtPlaceNotifier((cnrtNotifier_t)event, (cnrtQueue_t)stream));
return INFINI_STATUS_SUCCESS;
}
// Non-blocking poll of `event`: sets *status_ptr to COMPLETE when the
// notifier has been reached, NOT_READY while the device is still busy,
// and propagates any other cnrt error through CHECK_BANGRT.
infiniStatus_t eventQuery(infinirtEvent_t event, infinirtEventStatus_t *status_ptr) {
    // Fix: the original queried `stream`, a name that is not in scope in
    // this function (and was cast to cnrtQueue_t). The object to query is
    // the `event` parameter, cast to the cnrt notifier type.
    auto status = cnrtQueryNotifier((cnrtNotifier_t)event);
    if (status == cnrtSuccess) {
        *status_ptr = INFINIRT_EVENT_COMPLETE;
    } else if (status == cnrtErrorBusy) {
        *status_ptr = INFINIRT_EVENT_NOT_READY;
    } else {
        CHECK_BANGRT(status);
    }
    return INFINI_STATUS_SUCCESS;
}
// Block the calling thread until `event` has been reached.
infiniStatus_t eventSynchronize(infinirtEvent_t event) {
CHECK_BANGRT(cnrtWaitNotifier((cnrtNotifier_t)event));
return INFINI_STATUS_SUCCESS;
}
// Release the notifier backing `event`.
infiniStatus_t eventDestroy(infinirtEvent_t event) {
CHECK_BANGRT(cnrtNotifierDestroy((cnrtNotifier_t)event));
return INFINI_STATUS_SUCCESS;
}
// Allocate `size` bytes of device (MLU) memory into `*p_ptr`.
infiniStatus_t mallocDevice(void **p_ptr, size_t size) {
CHECK_BANGRT(cnrtMalloc(p_ptr, size));
return INFINI_STATUS_SUCCESS;
}
// Allocate `size` bytes of host memory through the cnrt allocator.
infiniStatus_t mallocHost(void **p_ptr, size_t size) {
CHECK_BANGRT(cnrtHostMalloc(p_ptr, size));
return INFINI_STATUS_SUCCESS;
}
// Free memory obtained from mallocDevice.
infiniStatus_t freeDevice(void *ptr) {
CHECK_BANGRT(cnrtFree(ptr));
return INFINI_STATUS_SUCCESS;
}
// Free memory obtained from mallocHost.
infiniStatus_t freeHost(void *ptr) {
CHECK_BANGRT(cnrtFreeHost(ptr));
return INFINI_STATUS_SUCCESS;
}
// Translate an infinirt copy direction into the corresponding cnrt
// transfer direction. Any unrecognized kind maps to cnrtMemcpyNoDirection.
cnrtMemTransDir_t toBangMemcpyKind(infinirtMemcpyKind_t kind) {
    if (kind == INFINIRT_MEMCPY_H2D) {
        return cnrtMemcpyHostToDev;
    }
    if (kind == INFINIRT_MEMCPY_D2H) {
        return cnrtMemcpyDevToHost;
    }
    if (kind == INFINIRT_MEMCPY_D2D) {
        return cnrtMemcpyDevToDev;
    }
    if (kind == INFINIRT_MEMCPY_H2H) {
        return cnrtMemcpyHostToHost;
    }
    return cnrtMemcpyNoDirection;
}
// Copy `size` bytes from `src` to `dst`; the cnrt transfer direction is
// derived from `kind` (blocking cnrtMemcpy — presumed synchronous per the
// cnrt API naming; confirm against cnrt docs).
infiniStatus_t memcpy(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind) {
CHECK_BANGRT(cnrtMemcpy(dst, src, size, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
// Asynchronous copy of `size` bytes enqueued on `stream`.
infiniStatus_t memcpyAsync(void *dst, const void *src, size_t size, infinirtMemcpyKind_t kind, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMemcpyAsync_V2(dst, src, size, (cnrtQueue_t)stream, toBangMemcpyKind(kind)));
return INFINI_STATUS_SUCCESS;
}
// Does not support async malloc. Use blocking-style malloc instead
// (`stream` is intentionally ignored).
infiniStatus_t mallocAsync(void **p_ptr, size_t size, infinirtStream_t stream) {
CHECK_BANGRT(cnrtMalloc(p_ptr, size));
return INFINI_STATUS_SUCCESS;
}
// Does not support async free. Use blocking-style free instead
// (`stream` is intentionally ignored).
infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) {
CHECK_BANGRT(cnrtFree(ptr));
return INFINI_STATUS_SUCCESS;
}
} // namespace infinirt::bang
#ifndef __INFINIRT_BANG_H__
#define __INFINIRT_BANG_H__
#include "../infinirt_impl.h"
// Public surface of the Cambricon BANG runtime backend: with
// ENABLE_BANG_API defined the real device API declarations are emitted,
// otherwise no-op stubs take their place (both macros presumably come
// from infinirt_impl.h — confirm).
namespace infinirt::bang {
#ifdef ENABLE_BANG_API
INFINIRT_DEVICE_API_IMPL
#else
INFINIRT_DEVICE_API_NOOP
#endif
} // namespace infinirt::bang
#endif // __INFINIRT_BANG_H__
#include "infinirt.h" #include "infinirt.h"
#include "../utils.h" #include "../utils.h"
#include "ascend/infinirt_ascend.h" #include "ascend/infinirt_ascend.h"
#include "bang/infinirt_bang.h"
#include "cpu/infinirt_cpu.h" #include "cpu/infinirt_cpu.h"
#include "cuda/infinirt_cuda.cuh" #include "cuda/infinirt_cuda.cuh"
...@@ -51,6 +52,9 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_ ...@@ -51,6 +52,9 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
case INFINI_DEVICE_NVIDIA: \ case INFINI_DEVICE_NVIDIA: \
_status = infinirt::cuda::API PARAMS; \ _status = infinirt::cuda::API PARAMS; \
break; \ break; \
case INFINI_DEVICE_CAMBRICON: \
_status = infinirt::bang::API PARAMS; \
break; \
case INFINI_DEVICE_ASCEND: \ case INFINI_DEVICE_ASCEND: \
_status = infinirt::ascend::API PARAMS; \ _status = infinirt::ascend::API PARAMS; \
break; \ break; \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment