Commit 46da1a27 authored by PanZezhongQY's avatar PanZezhongQY
Browse files

feat: cpu and cuda matmul

parents
#ifndef __INFINIOP_RMS_NORM_H__
#define __INFINIOP_RMS_NORM_H__

#include "../operator.h"

// Opaque descriptor for a configured RMSNorm operator instance.
typedef InfiniopDescriptor *infiniopRMSNormDescriptor_t;

// Create an RMSNorm descriptor for output y, input x and weight w.
// `epsilon` is the small constant added for numerical stability.
__C __export infiniopStatus_t infiniopCreateRMSNormDescriptor(
    infiniopHandle_t handle,
    infiniopRMSNormDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    float epsilon);

// Query the workspace size (in bytes) required by infiniopRMSNorm for this descriptor.
__C __export infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, size_t *size);

// Execute RMSNorm: reads x and w, writes y. `workspace` must be at least
// `workspace_size` bytes; `stream` is the backend execution stream
// (semantics are backend-specific — confirm against each implementation).
__C __export infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, size_t workspace_size,
                                              void *y, void const *x, void const *w, void *stream);

// Release a descriptor created by infiniopCreateRMSNormDescriptor.
__C __export infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc);

#endif
#ifndef __INFINIOP_ROTARY_EMBEDDING_H__
#define __INFINIOP_ROTARY_EMBEDDING_H__

#include "../operator.h"

// Opaque descriptor for a configured rotary position embedding (RoPE) operator.
typedef InfiniopDescriptor *infiniopRoPEDescriptor_t;

// Create a RoPE descriptor from the target tensor `t`, the position ids,
// and the precomputed sin/cos tables.
__C __export infiniopStatus_t infiniopCreateRoPEDescriptor(
    infiniopHandle_t handle,
    infiniopRoPEDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t t,
    infiniopTensorDescriptor_t pos_ids,
    infiniopTensorDescriptor_t sin_table,
    infiniopTensorDescriptor_t cos_table);

// Query the workspace size (in bytes) required by infiniopRoPE for this descriptor.
__C __export infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc, size_t *size);

// Apply rotary embedding to `t` (the only non-const data argument, so the
// update is in place), using pos_ids and the sin/cos tables as inputs.
__C __export infiniopStatus_t infiniopRoPE(
    infiniopRoPEDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *t,
    void const *pos_ids,
    void const *sin_table,
    void const *cos_table,
    void *stream);

// Release a descriptor created by infiniopCreateRoPEDescriptor.
__C __export infiniopStatus_t infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc);

#endif
#ifndef __INFINIOP_SWIGLU_H__
#define __INFINIOP_SWIGLU_H__

#include "../operator.h"

// Opaque descriptor for a configured SwiGLU operator instance.
typedef InfiniopDescriptor *infiniopSwiGLUDescriptor_t;

// Create a SwiGLU descriptor for output c and inputs a, b.
__C __export infiniopStatus_t infiniopCreateSwiGLUDescriptor(infiniopHandle_t handle,
                                                             infiniopSwiGLUDescriptor_t *desc_ptr,
                                                             infiniopTensorDescriptor_t c_desc,
                                                             infiniopTensorDescriptor_t a_desc,
                                                             infiniopTensorDescriptor_t b_desc);

// Execute SwiGLU: reads a and b, writes c. Note this operator takes no
// workspace, unlike RMSNorm/RoPE.
__C __export infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc,
                                             void *c,
                                             void const *a,
                                             void const *b,
                                             void *stream);

// Release a descriptor created by infiniopCreateSwiGLUDescriptor.
__C __export infiniopStatus_t infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc);

#endif
#ifndef __INFINIOP_STATUS__
#define __INFINIOP_STATUS__

// Status codes returned by every infiniop API call.
typedef enum {
    INFINIOP_STATUS_SUCCESS = 0,                   // operation completed
    INFINIOP_STATUS_INTERNAL_ERROR = 1,            // unexpected backend failure
    INFINIOP_STATUS_BAD_PARAM = 2,                 // invalid argument
    INFINIOP_STATUS_BAD_TENSOR_DTYPE = 3,          // unsupported tensor data type
    INFINIOP_STATUS_BAD_TENSOR_SHAPE = 4,          // unsupported/mismatched shape
    INFINIOP_STATUS_BAD_TENSOR_STRIDES = 5,        // unsupported memory layout
    INFINIOP_STATUS_NULL_POINTER = 6,              // required pointer was null
    INFINIOP_STATUS_INSUFFICIENT_WORKSPACE = 7,    // workspace smaller than required
    INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED = 8, // backend not available for device
    INFINIOP_STATUS_BAD_DEVICE = 9,                // device id out of range / unusable
    INFINIOP_STATUS_UNDEFINED_BEHAVIOR = 10,       // unspecified failure
} infiniopStatus_t;

#endif
#ifndef __INFINIOP_TENSOR_DESCRIPTOR__
#define __INFINIOP_TENSOR_DESCRIPTOR__

#include "../infinicore.h"
#include "./status.h"

// Describes a (possibly strided) tensor: dtype, rank, shape and strides.
// The descriptor does not own any data buffer.
struct InfiniopTensorDescriptor {
    // Datatype
    infiniDtype_t dtype;
    // Number of dimensions
    size_t ndim;
    // Shape of the tensor, ndim elements
    size_t *shape;
    // Stride of each dimension in elements, ndim elements
    int64_t *strides;
};

typedef struct InfiniopTensorDescriptor *infiniopTensorDescriptor_t;

// Allocate a descriptor and copy in shape/strides. Pairs with
// infiniopDestroyTensorDescriptor.
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape, int64_t const *strides, infiniDtype_t dtype);

// Release a descriptor created by infiniopCreateTensorDescriptor.
__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);

#endif// __INFINIOP_TENSOR_DESCRIPTOR__
# Builds the AscendC device kernels as a static library (ascend_kernels).
cmake_minimum_required(VERSION 3.16.0)

# project information
project(Ascend_C)

# Target SoC and CANN toolkit location; override on the configure command line.
set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory")
set(RUN_MODE "npu" CACHE STRING "run mode: npu")
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)

# Locate the AscendC kernel CMake helpers; their install path varies across CANN releases.
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.")
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

# Device-side kernel sources compiled with the AscendC toolchain.
ascendc_library(ascend_kernels STATIC
    ../../ops/swiglu/ascend/swiglu_kernel.cpp
    ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
    ../../ops/random_sample/ascend/random_sample_kernel.cpp
)
# Drives the CMake build for this directory's project.
.PHONY: build clean

# Absolute path of this Makefile and its directory (robust when invoked
# with `make -f` from another location).
MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
MKFILE_DIR := $(dir $(MKFILE_PATH))

# Configure and compile out-of-source with 8 parallel jobs.
build:
	mkdir -p build && cd build && cmake .. && make -j8

# Remove all build artifacts.
clean:
	rm -rf build
#include "ascend_handle.h"
// Create a backend handle bound to Ascend NPU `device_id`.
// Returns STATUS_BAD_DEVICE if the id is out of range or the device cannot
// be activated; on success *handle_ptr owns a newly allocated AscendContext.
infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id) {
    uint32_t device_count;
    auto count_ret = aclrtGetDeviceCount(&device_count);
    CHECK_RET(count_ret == ACL_SUCCESS,
              LOG_PRINT("aclrtGetDeviceCount failed. ERROR: %d\n", count_ret);
              return STATUS_BAD_DEVICE);
    // Reject negative ids as well as ids beyond the visible device count.
    if (device_id < 0 || device_id >= static_cast<int>(device_count)) {
        return STATUS_BAD_DEVICE;
    }
    auto ret = aclrtSetDevice(device_id);
    // The original code only logged this failure and still returned
    // STATUS_SUCCESS with a handle to a device that was never activated;
    // propagate the error instead.
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret);
              return STATUS_BAD_DEVICE);
    *handle_ptr = new AscendContext{DevAscendNpu, device_id};
    return STATUS_SUCCESS;
}
// Release a handle created by createAscendHandle.
// delete on a null pointer is a no-op, so passing nullptr is harmless.
infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr) {
    delete handle_ptr;
    return STATUS_SUCCESS;
}
#ifndef ASCEND_HANDLE_H
#define ASCEND_HANDLE_H

#include "common_ascend.h"
#include "device.h"
#include "status.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <aclnn/acl_meta.h>
#include <memory>

// Per-device context for the Ascend NPU backend.
struct AscendContext {
    Device device;  // backend device kind (DevAscendNpu)
    int device_id;  // index of the NPU this handle is bound to
};

typedef struct AscendContext *AscendHandle_t;

// Allocate a context bound to `device_id`; pairs with deleteAscendHandle.
infiniopStatus_t createAscendHandle(AscendHandle_t *handle_ptr, int device_id);

// Free a context created by createAscendHandle.
infiniopStatus_t deleteAscendHandle(AscendHandle_t handle_ptr);

#endif
#include "common_ascend.h"
// Product of the first `num` extents of `shape`; 1 for an empty shape.
int64_t numElements(const int64_t *shape, int64_t num) {
    return std::accumulate(shape, shape + num, int64_t{1},
                           std::multiplies<int64_t>());
}
// Allocate `workspaceSize` bytes of device memory into *workspaceAddr.
// *workspaceAddr is reset to nullptr first, so a zero-size request succeeds
// and yields a null workspace. Returns STATUS_EXECUTION_FAILED if
// aclrtMalloc fails.
infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize) {
    *workspaceAddr = nullptr;
    if (workspaceSize > 0) {
        // ACL_MEM_MALLOC_HUGE_FIRST: per ACL docs, prefer large-page memory
        // and fall back to normal pages.
        auto ret = aclrtMalloc(workspaceAddr, workspaceSize,
                               ACL_MEM_MALLOC_HUGE_FIRST);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret);
                  return STATUS_EXECUTION_FAILED);
    }
    return STATUS_SUCCESS;
}
// Free device memory allocated by mallocWorkspace.
// A null pointer is accepted (matches the zero-size path of mallocWorkspace).
// Returns STATUS_EXECUTION_FAILED if aclrtFree fails.
infiniopStatus_t freeWorkspace(void *workspaceAddr) {
    if (workspaceAddr != nullptr) {
        auto ret = aclrtFree(workspaceAddr);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclrtFree failed, ERROR: %d\n", ret);
                  return STATUS_EXECUTION_FAILED);
    }
    return STATUS_SUCCESS;
}
// Map an InfiniOp data type to the corresponding ACL data type.
// Unrecognized types map to ACL_DT_UNDEFINED.
aclDataType toAclDataType(DT dt) {
    struct Mapping {
        DT from;
        aclDataType to;
    };
    static const Mapping kTable[] = {
        {I8, aclDataType::ACL_INT8},
        {I16, aclDataType::ACL_INT16},
        {I32, aclDataType::ACL_INT32},
        {I64, aclDataType::ACL_INT64},
        {U8, aclDataType::ACL_UINT8},
        {U16, aclDataType::ACL_UINT16},
        {U32, aclDataType::ACL_UINT32},
        {U64, aclDataType::ACL_UINT64},
        {F16, aclDataType::ACL_FLOAT16},
        {BF16, aclDataType::ACL_BF16},
        {F32, aclDataType::ACL_FLOAT},
        {F64, aclDataType::ACL_DOUBLE},
    };
    for (const auto &entry : kTable) {
        if (dt == entry.from) {
            return entry.to;
        }
    }
    return aclDataType::ACL_DT_UNDEFINED;
}
// Human-readable name of an ACL data type, for logging; "UNKNOWN" if unmapped.
const char *dataTypeToString(aclDataType dtype) {
    struct Entry {
        aclDataType value;
        const char *name;
    };
#define DT_ENTRY(x) {x, #x}
    static const Entry kEntries[] = {
        DT_ENTRY(ACL_DT_UNDEFINED),
        DT_ENTRY(ACL_FLOAT),
        DT_ENTRY(ACL_FLOAT16),
        DT_ENTRY(ACL_INT8),
        DT_ENTRY(ACL_INT32),
        DT_ENTRY(ACL_UINT8),
        DT_ENTRY(ACL_INT16),
        DT_ENTRY(ACL_UINT16),
        DT_ENTRY(ACL_UINT32),
        DT_ENTRY(ACL_INT64),
        DT_ENTRY(ACL_UINT64),
        DT_ENTRY(ACL_DOUBLE),
        DT_ENTRY(ACL_BOOL),
        DT_ENTRY(ACL_STRING),
        DT_ENTRY(ACL_COMPLEX64),
        DT_ENTRY(ACL_COMPLEX128),
        DT_ENTRY(ACL_BF16),
        DT_ENTRY(ACL_INT4),
        DT_ENTRY(ACL_UINT1),
        DT_ENTRY(ACL_COMPLEX32),
    };
#undef DT_ENTRY
    for (const auto &e : kEntries) {
        if (e.value == dtype) {
            return e.name;
        }
    }
    return "UNKNOWN";
}
// Human-readable name of an ACL tensor format, for logging; "UNKNOWN" if unmapped.
const char *formatToString(aclFormat format) {
    struct Entry {
        aclFormat value;
        const char *name;
    };
#define FMT_ENTRY(x) {x, #x}
    static const Entry kEntries[] = {
        FMT_ENTRY(ACL_FORMAT_UNDEFINED),
        FMT_ENTRY(ACL_FORMAT_NCHW),
        FMT_ENTRY(ACL_FORMAT_NHWC),
        FMT_ENTRY(ACL_FORMAT_ND),
        FMT_ENTRY(ACL_FORMAT_NC1HWC0),
        FMT_ENTRY(ACL_FORMAT_FRACTAL_Z),
        FMT_ENTRY(ACL_FORMAT_NC1HWC0_C04),
        FMT_ENTRY(ACL_FORMAT_HWCN),
        FMT_ENTRY(ACL_FORMAT_NDHWC),
        FMT_ENTRY(ACL_FORMAT_FRACTAL_NZ),
        FMT_ENTRY(ACL_FORMAT_NCDHW),
        FMT_ENTRY(ACL_FORMAT_NDC1HWC0),
        FMT_ENTRY(ACL_FRACTAL_Z_3D),
        FMT_ENTRY(ACL_FORMAT_NC),
        FMT_ENTRY(ACL_FORMAT_NCL),
    };
#undef FMT_ENTRY
    for (const auto &e : kEntries) {
        if (e.value == format) {
            return e.name;
        }
    }
    return "UNKNOWN";
}
#ifndef __COMMON_ASCEND_H__
#define __COMMON_ASCEND_H__

#include "operators.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <cstdio>
#include <functional>
#include <inttypes.h>
#include <numeric>
#include <vector>

// NOTE(review): this extern "C" block wraps only preprocessor macros, which
// have no linkage — it is a no-op here; confirm whether it was meant to cover
// the declarations below.
#ifdef __cplusplus
extern "C" {
#endif

// If `cond` is false, evaluate `return_expr` (typically a LOG_PRINT and/or a
// return statement). do/while(0) makes it behave as a single statement.
#define CHECK_RET(cond, return_expr) \
    do {                             \
        if (!(cond)) {               \
            return_expr;             \
        }                            \
    } while (0)

// printf-style logging helper.
#define LOG_PRINT(message, ...)         \
    do {                                \
        printf(message, ##__VA_ARGS__); \
    } while (0)

#ifdef __cplusplus
};
#endif

// Product of the first `num` extents of `shape`.
int64_t numElements(const int64_t *shape, int64_t num);
// Logging helpers: enum -> name strings.
const char *dataTypeToString(aclDataType dtype);
const char *formatToString(aclFormat format);
// Device workspace allocation helpers (see common_ascend.cpp).
infiniopStatus_t mallocWorkspace(void **workspaceAddr, uint64_t workspaceSize);
infiniopStatus_t freeWorkspace(void *workspaceAddr);
// InfiniOp dtype -> ACL dtype; ACL_DT_UNDEFINED when unsupported.
aclDataType toAclDataType(DT dt);

#endif
#include "tensor_aclnn.h"
#include "../../ops/utils.h"
#include <algorithm>
/// @brief Fill this descriptor with dtype, shape and strides, then derive the
/// storage shape. Shape and strides must have matching lengths.
/// @return STATUS_BAD_PARAM on length mismatch, otherwise the status of
/// inferStorageShape().
infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const std::vector<int64_t> &shape, const std::vector<int64_t> &strides) {
    if (shape.size() != strides.size()) {
        return STATUS_BAD_PARAM;
    }
    this->ndim = shape.size();
    this->shape = shape;
    this->strides = strides;
    this->dataType = dtype;
    // Only the generic ND layout is supported for now.
    // TODO: Support other formats.
    this->format = aclFormat::ACL_FORMAT_ND;
    CHECK_STATUS(this->inferStorageShape(), STATUS_SUCCESS);
    return STATUS_SUCCESS;
}
/// @brief Infer the storage shape. Returns a 1D shape holding the total
/// storage extent (the max-stride dimension's extent times its stride).
/// We don't see why a higher-dimensional storage shape is ever needed;
/// to change if necessary.
infiniopStatus_t aclnnTensorDescriptor::inferStorageShape() {
    this->storageNdim = 1;
    // 0-dimensional (scalar) tensors have no strides; their storage holds a
    // single element. The original code called max_element on the empty
    // vector and then indexed with size() — past-the-end access.
    if (this->strides.empty()) {
        this->storageShape = std::vector<int64_t>{1};
        return STATUS_SUCCESS;
    }
    auto it = std::max_element(this->strides.begin(), this->strides.end());
    auto max_stride_index = static_cast<uint64_t>(std::distance(this->strides.begin(), it));
    this->storageShape = std::vector<int64_t>({this->shape[max_stride_index] * this->strides[max_stride_index]});
    return STATUS_SUCCESS;
}
/// @brief Populate this descriptor from an InfiniOp tensor descriptor,
/// converting the shape elements to int64_t.
/// @param y source infiniopTensorDescriptor
/// @return status of setDescriptor
infiniopStatus_t aclnnTensorDescriptor::fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y) {
    const uint64_t rank = y->ndim;
    std::vector<int64_t> dims(rank);
    std::vector<int64_t> steps(rank);
    for (uint64_t d = 0; d < rank; ++d) {
        dims[d] = static_cast<int64_t>(y->shape[d]);
        steps[d] = y->strides[d];
    }
    return setDescriptor(toAclDataType(y->dt), dims, steps);
}
/// @brief Wrapper of aclCreateTensor. Creates the aclTensor for this
/// descriptor and caches it in `t`; subsequent calls are no-ops, so the
/// tensor is only ever built once per descriptor.
/// See https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha001/apiref/appdevgapi/aclcppdevg_03_0168.html
/// @param data Device global-memory pointer the tensor views (may be null;
///             defaults to nullptr in the declaration).
/// @return STATUS_SUCCESS (the aclCreateTensor result is not checked here).
infiniopStatus_t aclnnTensorDescriptor::createTensor(void *data) {
    // Already created: keep the cached tensor.
    if (this->t) {
        return STATUS_SUCCESS;
    }
    // Argument order follows the aclCreateTensor signature:
    // view shape/ndim, dtype, strides, offset, format, storage shape/ndim, data.
    this->t = aclCreateTensor(this->shape.data(),
                              this->ndim,
                              this->dataType,
                              this->strides.data(),
                              this->offset,
                              this->format,
                              this->storageShape.data(),
                              this->storageNdim,
                              data);
    return STATUS_SUCCESS;
}
/// @brief Destroy the cached aclTensor and reset `t` to null so the
/// descriptor can be reused.
/// @return STATUS_EXECUTION_FAILED if aclDestroyTensor reports an error.
infiniopStatus_t aclnnTensorDescriptor::destroyTensor() {
    auto ret = aclDestroyTensor(this->t);
    // Fixed the misspelled API name in the log message ("aclDesctroyTensor").
    CHECK_RET(ret == ACL_SUCCESS,
              LOG_PRINT("aclDestroyTensor failed, ERROR: %d\n", ret);
              return STATUS_EXECUTION_FAILED);
    t = nullptr;
    return STATUS_SUCCESS;
}
// RAII cleanup: release the cached aclTensor, if one was created.
aclnnTensorDescriptor::~aclnnTensorDescriptor() {
    if (this->t) {
        destroyTensor();
    }
}
/// @brief Render this descriptor as a human-readable multi-line string.
/// @return Heap-allocated (malloc) C string — caller must free() it.
///         NULL on allocation failure.
char *aclnnTensorDescriptor::toString() {
    // Rough upper bound: fixed text plus ~40 chars per dimension entry.
    size_t bufferSize = 1024 + this->ndim * 40 + this->storageNdim * 40;
    char *buffer = (char *) malloc(bufferSize);
    if (!buffer) return NULL;
    // Write info into buffer
    char *ptr = buffer;
    // ndim is uint64_t, so PRIu64 is required (the original used PRId64,
    // which mismatches the argument type).
    ptr += sprintf(ptr, "ndim: %" PRIu64 "\n", this->ndim);
    ptr += sprintf(ptr, "shape: [");
    for (uint64_t i = 0; i < this->ndim; ++i) {
        ptr += sprintf(ptr, "%" PRId64, this->shape[i]);
        // `i + 1 < ndim` avoids the unsigned underflow of `ndim - 1` at ndim == 0.
        if (i + 1 < this->ndim) {
            ptr += sprintf(ptr, ", ");
        }
    }
    ptr += sprintf(ptr, "]\n");
    ptr += sprintf(ptr, "stride: [");
    for (uint64_t i = 0; i < this->ndim; ++i) {
        ptr += sprintf(ptr, "%" PRId64, this->strides[i]);
        if (i + 1 < this->ndim) {
            ptr += sprintf(ptr, ", ");
        }
    }
    ptr += sprintf(ptr, "]\n");
    ptr += sprintf(ptr, "offset: %" PRId64 "\n", this->offset);
    ptr += sprintf(ptr, "dataType: %s\n", dataTypeToString(this->dataType));
    ptr += sprintf(ptr, "format: %s\n", formatToString(this->format));
    ptr += sprintf(ptr, "storageShape: [");
    for (int64_t i = 0; i < this->storageNdim; ++i) {
        ptr += sprintf(ptr, "%" PRId64, this->storageShape[i]);
        if (i + 1 < this->storageNdim) {
            ptr += sprintf(ptr, ", ");
        }
    }
    ptr += sprintf(ptr, "]\n");
    ptr += sprintf(ptr, "storageNdim: %" PRId64 "\n", this->storageNdim);
    return buffer;
}
#ifndef __ACLNN_TENSOR__
#define __ACLNN_TENSOR__

#include "./common_ascend.h"
#include "operators.h"
#include "tensor.h"
#include "tensor/tensor_descriptor.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <vector>

// Aclnn tensor descriptor: holds everything needed to build (and cache)
// an aclTensor for the aclnn operator APIs.
struct aclnnTensorDescriptor {
    uint64_t ndim;                     // number of view dimensions
    std::vector<int64_t> shape;        // view shape, ndim elements
    std::vector<int64_t> strides;      // element strides, ndim elements
    int64_t offset;                    // element offset into storage
    aclDataType dataType;              // ACL element type
    aclFormat format;                  // layout format (currently always ND)
    std::vector<int64_t> storageShape; // underlying storage shape (1D, see inferStorageShape)
    int64_t storageNdim;               // rank of storageShape
    aclTensor *t;                      // cached aclTensor, built lazily by createTensor
    // Fill dtype/shape/strides and derive the storage shape.
    infiniopStatus_t setDescriptor(aclDataType dtype, const std::vector<int64_t> &shape, const std::vector<int64_t> &strides);
    // Derive storageShape/storageNdim from shape and strides.
    infiniopStatus_t inferStorageShape();
    // Convert from an InfiniOp tensor descriptor.
    infiniopStatus_t fromInfiniOpTensorDescriptor(infiniopTensorDescriptor_t y_desc);
    // Build (once) the aclTensor viewing `data`.
    infiniopStatus_t createTensor(void *data = nullptr);
    // Destroy the cached aclTensor.
    infiniopStatus_t destroyTensor();
    ~aclnnTensorDescriptor();
    // Debug string; caller frees the malloc'd result.
    char *toString();
};

typedef aclnnTensorDescriptor *aclnnTensorDescriptor_t;

#endif
#include "bang_handle.h"
// Create a backend handle bound to Cambricon MLU `device_id`, pre-populating
// its pool with one cnnl handle created on that device.
// Returns INFINIOP_STATUS_BAD_DEVICE if the id is out of range or the device
// cannot be queried/selected.
infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_id) {
    unsigned int device_count;
    // The original code ignored this return value; a failed query would leave
    // device_count uninitialized.
    if (cnrtGetDeviceCount(&device_count) != cnrtSuccess) {
        return INFINIOP_STATUS_BAD_DEVICE;
    }
    // Reject negative ids as well as ids beyond the visible device count.
    if (device_id < 0 || device_id >= static_cast<int>(device_count)) {
        return INFINIOP_STATUS_BAD_DEVICE;
    }
    if (cnrtSetDevice(device_id) != cnrtSuccess) {
        return INFINIOP_STATUS_BAD_DEVICE;
    }
    // Seed the pool with a single handle bound to the now-current device.
    auto pool = std::make_shared<Pool<cnnlHandle_t>>();
    cnnlHandle_t handle;
    cnnlCreate(&handle);
    pool->push(std::move(handle));
    *handle_ptr = new InfiniopBangHandle{INFINI_DEVICE_CAMBRICON, device_id, std::move(pool)};
    return INFINIOP_STATUS_SUCCESS;
}
#ifndef BANG_HANDLE_H
#define BANG_HANDLE_H

#include "../pool.h"
#include "cnnl.h"
#include "cnrt.h"
#include "infiniop/handle.h"
#include <memory>

// Per-device handle for the Cambricon BANG backend, carrying a shared pool
// of reusable cnnl handles.
struct InfiniopBangHandle {
    infiniDevice_t device;
    int device_id;
    std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
};

typedef struct InfiniopBangHandle *infiniopBangHandle_t;

infiniopStatus_t createBangHandle(infiniopBangHandle_t *handle_ptr, int device_id);

// Borrow a cnnl handle from the pool (creating a fresh one on `device_id` if
// the pool is empty), bind it to `queue`, invoke `f` with it, and return it
// to the pool afterwards.
template<typename T>
void use_cnnl(std::shared_ptr<Pool<cnnlHandle_t>> &pool, int device_id, cnrtQueue_t queue, T const &f) {
    auto handle = pool->pop();
    if (!handle) {
        // Pool exhausted: create a new handle locally, then assign it.
        // The original code called cnnlCreate(&(*handle)) — dereferencing the
        // empty pop() result before it held a value, which is undefined
        // behavior; build the handle in a local first.
        cnrtSetDevice(device_id);
        cnnlHandle_t fresh;
        cnnlCreate(&fresh);
        handle = fresh;
    }
    cnnlSetQueue(*handle, (cnrtQueue_t) queue);
    f(*handle);
    pool->push(std::move(*handle));
}

#endif
#ifndef __COMMON_BANG_H__
#define __COMMON_BANG_H__

#include "cnnl.h"
#include "infinicore.h"
#include <vector>

// On-chip scratch memory budgets used by BANG kernels.
// NOTE(review): the value is 256 KiB but the original comment claimed the
// maximum NRAM is 768 KiB — confirm the intended budget for the target chip.
const int NRAM_MAX_SIZE = 1024 * 256;
const int GDRAM_MAX_SIZE = 1024 * 1024 * 1024;

// Set a cnnl tensor descriptor from `layout` without strides (contiguous
// array layout). NOTE(review): the dtype is hardcoded to CNNL_DTYPE_HALF
// instead of using cnnlDataTypeConvert(layout->...) — confirm callers only
// use fp16 tensors here.
inline void setCnnlTensor(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) {
    std::vector<int> dims(layout->ndim);
    for (uint64_t i = 0; i < layout->ndim; i++) {
        dims[i] = static_cast<int>(layout->shape[i]);
    }
    cnnlSetTensorDescriptor(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
                            dims.size(), dims.data());
}

// Set a cnnl tensor descriptor from `layout` including strides.
// Strides are converted from bytes to elements by dividing by the dtype size.
inline void setCnnlTensorEx(cnnlTensorDescriptor_t desc, const TensorDescriptor *layout) {
    std::vector<int> dim_size(layout->ndim), dim_stride(layout->ndim);
    for (uint64_t i = 0; i < layout->ndim; i++) {
        dim_size[i] = static_cast<int>(layout->shape[i]);
        dim_stride[i] = static_cast<int>(layout->strides[i] / layout->dt.size);
    }
    cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
                              dim_size.size(), dim_size.data(), dim_stride.data());
}

// Map an infiniop dtype to the corresponding cnnl dtype;
// CNNL_DTYPE_INVALID when unsupported.
inline cnnlDataType_t cnnlDataTypeConvert(infiniDtype_t dataType) {
    if (dtype_eq(dataType, INFINI_DTYPE_F32)) {
        return CNNL_DTYPE_FLOAT;
    } else if (dtype_eq(dataType, INFINI_DTYPE_F64)) {
        return CNNL_DTYPE_DOUBLE;
    } else if (dtype_eq(dataType, INFINI_DTYPE_F16)) {
        return CNNL_DTYPE_HALF;
    } else if (dtype_eq(dataType, INFINI_DTYPE_I8)) {
        return CNNL_DTYPE_INT8;
    } else if (dtype_eq(dataType, INFINI_DTYPE_I32)) {
        return CNNL_DTYPE_INT32;
    } else if (dtype_eq(dataType, INFINI_DTYPE_U8)) {
        return CNNL_DTYPE_UINT8;
    } else if (dtype_eq(dataType, INFINI_DTYPE_BF16)) {
        return CNNL_DTYPE_BFLOAT16;
    } else if (dtype_eq(dataType, INFINI_DTYPE_I64)) {
        return CNNL_DTYPE_INT64;
    } else {
        return CNNL_DTYPE_INVALID;
    }
}

#endif// __COMMON_BANG_H__
#include "./common_cpu.h"
// Convert an IEEE-754 binary16 value (given as its raw bit pattern) to float.
// Handles Inf/NaN, signed zero, subnormals, and normalized values.
float f16_to_f32(uint16_t h) {
    // Cast before shifting: the original `(h & 0x8000) << 16` shifted a set
    // bit into the sign bit of a promoted int, which is undefined behavior.
    uint32_t sign = (uint32_t) (h & 0x8000) << 16;// sign bit
    int32_t exponent = (h >> 10) & 0x1F;          // 5-bit biased exponent
    uint32_t mantissa = h & 0x3FF;                // 10-bit fraction
    uint32_t f32;
    if (exponent == 31) {
        // Inf (mantissa == 0) or NaN (mantissa != 0): all-ones f32 exponent,
        // mantissa widened into the f32 fraction field.
        f32 = sign | 0x7F800000 | (mantissa << 13);
    } else if (exponent == 0) {
        if (mantissa == 0) {
            // Zero (positive or negative): just the sign bit.
            f32 = sign;
        } else {
            // Subnormal half: renormalize the mantissa into a normal float.
            exponent = -14;
            while ((mantissa & 0x400) == 0) {
                mantissa <<= 1;
                exponent--;
            }
            mantissa &= 0x3FF;// clear the now-implicit leading 1
            f32 = sign | ((uint32_t) (exponent + 127) << 23) | (mantissa << 13);
        }
    } else {
        // Normalized: re-bias the exponent from 15 to 127.
        f32 = sign | ((uint32_t) (exponent + 127 - 15) << 23) | (mantissa << 13);
    }
    // memcpy-based type punning replaces `*(float *) &f32`, which violates
    // strict aliasing.
    float out;
    std::memcpy(&out, &f32, sizeof(out));
    return out;
}
// Convert a float to an IEEE-754 binary16 bit pattern (truncating rounding).
// Handles Inf/NaN, overflow to Inf, subnormal results, and underflow to
// signed zero.
uint16_t f32_to_f16(float val) {
    // memcpy-based type punning replaces `*(uint32_t *) &val`, which violates
    // strict aliasing.
    uint32_t f32;
    std::memcpy(&f32, &val, sizeof(f32));
    uint16_t sign = (f32 >> 16) & 0x8000;                   // sign bit
    int32_t exponent = (int32_t) ((f32 >> 23) & 0xFF) - 127;// unbiased exponent
    uint32_t mantissa = f32 & 0x7FFFFF;                     // 23-bit fraction
    if (exponent >= 31) {// too large for half, or Inf/NaN
        // NaN: f32 exponent field all ones (128 after de-bias) with payload.
        if (exponent == 128 && mantissa != 0) {
            return sign | 0x7E00;
        }
        // Infinity (or finite overflow).
        return sign | 0x7C00;
    } else if (exponent >= -14) {// normalized half
        return sign | ((uint32_t) (exponent + 15) << 10) | (mantissa >> 13);
    } else if (exponent >= -24) {// subnormal half
        mantissa |= 0x800000;// restore the implicit leading 1
        mantissa >>= (-14 - exponent);
        return sign | (mantissa >> 13);
    } else {
        // Too small even for a subnormal: signed zero.
        return sign;
    }
}
// Given the flat index into a broadcasted tensor, return the memory offset
// (in elements) into the original (pre-broadcast) tensor: decompose the flat
// index into per-dimension coordinates via the broadcasted strides, then
// re-accumulate with the target strides.
size_t indexToReducedOffset(size_t flat_index, size_t ndim, int64_t const *broadcasted_strides, int64_t const *target_strides) {
    size_t offset = 0;
    size_t remaining = flat_index;
    for (size_t d = 0; d < ndim; ++d) {
        const size_t coord = remaining / broadcasted_strides[d];
        remaining -= coord * broadcasted_strides[d];
        offset += coord * target_strides[d];
    }
    return offset;
}
// Return the memory offset (in elements) of `flat_index` in a tensor with the
// given shape and strides, consuming dimensions from the innermost (last) axis.
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, int64_t const *strides) {
    size_t res = 0;
    // The original condition `i-- >= 0` is always true for unsigned i: when i
    // reaches 0 it wraps to SIZE_MAX and reads shape[SIZE_MAX]. `i-- > 0`
    // visits indices ndim-1 .. 0 exactly once and terminates.
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}
// Total element count of an ndim-ary tensor after applying symmetric padding.
// The first two axes (typically batch and channel) are never padded; axis i
// (i >= 2) grows by 2 * pads[i - 2].
size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
    uint64_t count = 1;
    for (size_t axis = 0; axis < ndim; ++axis) {
        const size_t pad = (axis < 2) ? 0 : 2 * pads[axis - 2];
        count *= shape[axis] + pad;
    }
    return count;
}
// Return the shape after applying symmetric padding: axes 0 and 1 are kept,
// axis i (i >= 2) grows by 2 * pads[i - 2].
std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape, size_t const *pads) {
    // Iterator-range construction replaces the memcpy into a presized vector.
    std::vector<size_t> padded_shape(shape, shape + ndim);
    for (size_t i = 2; i < ndim; ++i) {
        padded_shape[i] += 2 * pads[i - 2];
    }
    // Plain return enables NRVO; the original `return std::move(...)`
    // suppressed copy elision for no benefit.
    return padded_shape;
}
#ifndef __INFINIOP__COMMON_CPU_H__
#define __INFINIOP__COMMON_CPU_H__

#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Convert a half-precision float (raw binary16 bits) to single precision.
float f16_to_f32(uint16_t code);
// Convert a single-precision float to half-precision (raw binary16 bits).
uint16_t f32_to_f16(float val);
// Return the memory offset into the original tensor, given the flattened
// index of the broadcasted tensor and both stride arrays.
size_t indexToReducedOffset(size_t flat_index, size_t ndim, int64_t const *broadcasted_strides, int64_t const *target_strides);
// Return the memory offset of a tensor element given its flattened index.
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, int64_t const *strides);
/**
 * Get the total array size (element count) after applying padding for an
 * ndim-ary tensor with the given shape. Axes 0 and 1 are not padded.
 */
size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads);
// Calculate and return the padded shape (axes 0 and 1 unchanged).
std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape, size_t const *pads);

#endif// __INFINIOP__COMMON_CPU_H__
#include "./cpu_handle.h"
// Allocate a handle for the CPU backend into *handle_ptr.
// No device discovery is needed; the device id is always 0. Always succeeds.
infiniopStatus_t createCpuHandle(infiniopCpuHandle_t* handle_ptr){
    *handle_ptr = new InfiniopHandle{INFINI_DEVICE_CPU, 0};
    return INFINIOP_STATUS_SUCCESS;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment