Commit 46da1a27 authored by PanZezhongQY

feat: cpu and cuda matmul

#ifndef __INFINIOP_CPU_HANDLE_H__
#define __INFINIOP_CPU_HANDLE_H__
#include "infiniop/handle.h"
typedef infiniopHandle_t infiniopCpuHandle_t;
infiniopStatus_t createCpuHandle(infiniopCpuHandle_t *handle_ptr);
#endif
#ifndef __INFINIOP_COMMON_CUDA_H__
#define __INFINIOP_COMMON_CUDA_H__
#define MAX_THREADS_PER_BLOCK 1024
#define MAX_WARP_PER_BLOCK 32
#define WARP_SIZE 32
#include <iostream>
#define checkCudaErrorWithCode(call, errorCode) \
do { \
if (auto status = call; status != cudaSuccess) { \
std::cerr << "CUDA error: " << cudaGetErrorString(status) \
<< " in file " << __FILE__ << ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return errorCode; \
} \
} while (0)
#define checkCudaError(call) checkCudaErrorWithCode(call, INFINIOP_STATUS_BAD_DEVICE)
#define checkCudnnError(call) \
do { \
if (auto status = call; status != CUDNN_STATUS_SUCCESS) { \
std::cerr << "CUDNN error: " << cudnnGetErrorString(status) \
<< " in file " << __FILE__ << ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return INFINIOP_STATUS_INTERNAL_ERROR; \
} \
} while (0)
#include "infinicore.h"
#include <cudnn.h>
#include <cublas_v2.h>
#include <memory>
#include "../pool.h"
#include "cuda_handle.h"
#include <cuda_fp16.h>
struct InfiniopCudaHandle {
infiniDevice_t device;
int device_id;
std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t;
std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handles_t;
cudaDeviceProp prop;
int compute_capability_major;
int compute_capability_minor;
};
template<typename T>
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t, int device_id, cudaStream_t stream, T const &f) {
auto handle = cublas_handles_t->pop();
if (!handle) {
// Pool was empty: create a new cublas handle bound to this device.
cudaSetDevice(device_id);
cublasHandle_t new_handle;
cublasCreate(&new_handle);
handle = new_handle;
}
cublasSetStream(*handle, (cudaStream_t) stream);
f(*handle);
cublas_handles_t->push(std::move(*handle));
}
template<typename T>
cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handles_t, int device_id, cudaStream_t stream, T const &f) {
auto handle = cudnn_handles_t->pop();
if (!handle) {
// Pool was empty: create a new cudnn handle bound to this device.
cudaSetDevice(device_id);
cudnnHandle_t new_handle;
cudnnCreate(&new_handle);
handle = new_handle;
}
cudnnSetStream(*handle, stream);
cudnnStatus_t status = f(*handle);
cudnn_handles_t->push(std::move(*handle));
return status;
}
inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
switch (dt) {
case INFINI_DTYPE_F16:
return CUDNN_DATA_HALF;
case INFINI_DTYPE_F32:
return CUDNN_DATA_FLOAT;
case INFINI_DTYPE_F64:
return CUDNN_DATA_DOUBLE;
case INFINI_DTYPE_BF16:
return CUDNN_DATA_BFLOAT16;
case INFINI_DTYPE_I8:
return CUDNN_DATA_INT8;
case INFINI_DTYPE_I32:
return CUDNN_DATA_INT32;
case INFINI_DTYPE_I64:
return CUDNN_DATA_INT64;
case INFINI_DTYPE_U8:
return CUDNN_DATA_UINT8;
default:
return CUDNN_DATA_FLOAT;
}
}
// Return the memory offset into the original tensor, given the flat index of
// the broadcasted tensor
inline __device__ __host__ size_t indexToReducedOffset(
size_t flat_index, size_t ndim, int64_t const *broadcasted_strides,
int64_t const *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
flat_index %= broadcasted_strides[i];
}
return res;
}
// get the memory offset of the given element in a tensor given its flat index
inline __device__ __host__ size_t indexToOffset(size_t flat_index, size_t ndim,
size_t const *shape,
int64_t const *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i];
}
return res;
}
#endif // __INFINIOP_COMMON_CUDA_H__
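// --- usage sketch (not part of this commit): what indexToOffset and
// indexToReducedOffset above compute, restated as plain host C++ so the
// example builds without the CUDA toolkit (the __device__ __host__
// qualifiers are dropped; the arithmetic is the same).
#include <cstdint>
#include <cstdio>

static size_t index_to_offset(size_t flat_index, size_t ndim,
                              size_t const *shape, int64_t const *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}

static size_t index_to_reduced_offset(size_t flat_index, size_t ndim,
                                      int64_t const *broadcasted_strides,
                                      int64_t const *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

int main() {
    // A 2x3 tensor stored with a padded row stride of 4 (not contiguous):
    // flat (logical) index 4 is element (1, 1), at memory offset 1*4 + 1*1 = 5.
    size_t shape[2] = {2, 3};
    int64_t strides[2] = {4, 1};
    std::printf("%zu\n", index_to_offset(4, 2, shape, strides)); // prints 5

    // A {1, 3} tensor broadcast to {2, 3}: the broadcasted shape's contiguous
    // strides are {3, 1}, the original tensor's strides are {0, 1} (stride 0 on
    // the broadcast axis), so flat index 4 -> element (1, 1) -> offset 1.
    int64_t bcast_strides[2] = {3, 1};
    int64_t target_strides[2] = {0, 1};
    std::printf("%zu\n", index_to_reduced_offset(4, 2, bcast_strides, target_strides)); // prints 1
    return 0;
}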
#include "./common_cuda.cuh"
infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, int device_id, infiniDevice_t cuda_device_type) {
// Check if device_id is valid
int device_count;
checkCudaError(cudaGetDeviceCount(&device_count));
if (device_id >= device_count) {
return INFINIOP_STATUS_BAD_DEVICE;
}
// Create a new cublas handle pool
auto pool = std::make_shared<Pool<cublasHandle_t>>();
if (cudaSetDevice(device_id) != cudaSuccess) {
return INFINIOP_STATUS_BAD_DEVICE;
}
cublasHandle_t handle;
cublasCreate(&handle);
pool->push(std::move(handle));
// create a cudnn handle pool
auto cudnn_pool = std::make_shared<Pool<cudnnHandle_t>>();
cudnnHandle_t cudnn_handle;
checkCudnnError(cudnnCreate(&cudnn_handle));
cudnn_pool->push(std::move(cudnn_handle));
// set CUDA device property
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, device_id);
// set device compute capability numbers
int capability_major;
int capability_minor;
cudaDeviceGetAttribute(&capability_major, cudaDevAttrComputeCapabilityMajor, device_id);
cudaDeviceGetAttribute(&capability_minor, cudaDevAttrComputeCapabilityMinor, device_id);
*handle_ptr = new InfiniopCudaHandle{
cuda_device_type,
device_id,
std::move(pool),
std::move(cudnn_pool),
std::move(prop),
capability_major,
capability_minor,
};
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t deleteCudaHandle(infiniopCudaHandle_t handle_ptr) {
handle_ptr->cublas_handles_t = nullptr;
handle_ptr->cudnn_handles_t = nullptr;
delete handle_ptr;
return INFINIOP_STATUS_SUCCESS;
}
#ifndef __INFINIOP_CUDA_HANDLE_H__
#define __INFINIOP_CUDA_HANDLE_H__
#include "infiniop/handle.h"
struct InfiniopCudaHandle;
typedef struct InfiniopCudaHandle *infiniopCudaHandle_t;
infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, int device_id, infiniDevice_t cuda_device_type);
infiniopStatus_t deleteCudaHandle(infiniopCudaHandle_t handle_ptr);
#endif
#include "infiniop/handle.h"
#ifdef ENABLE_CPU_API
#include "./cpu/cpu_handle.h"
#endif
#ifdef ENABLE_CUDA_API
#include "./cuda/cuda_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_MLU
#include "./bang/bang_handle.h"
#endif
#ifdef ENABLE_ASCEND_NPU
#include "./ascend/ascend_handle.h"
#endif
__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr, infiniDevice_t device, int device_id) {
if (handle_ptr == nullptr) {
return INFINIOP_STATUS_NULL_POINTER;
}
if (device_id < 0) {
return INFINIOP_STATUS_BAD_DEVICE;
}
switch (device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return createCpuHandle((infiniopCpuHandle_t *) handle_ptr);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return createCudaHandle((infiniopCudaHandle_t *) handle_ptr, device_id, device);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return createBangHandle((infiniopBangHandle_t *) handle_ptr, device_id);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return createAscendHandle((infiniopAscendHandle_t *) handle_ptr, device_id);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__C infiniopStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
switch (handle->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
delete handle;
return INFINIOP_STATUS_SUCCESS;
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return deleteCudaHandle((infiniopCudaHandle_t) handle);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
delete (infiniopBangHandle_t) handle;
return INFINIOP_STATUS_SUCCESS;
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return deleteAscendHandle((infiniopAscendHandle_t) handle);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
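// --- usage sketch (not part of this commit): creating and destroying a
// handle through the public C API declared in "infiniop/handle.h".
// Built with ENABLE_CPU_API; for the CPU backend the device id is only
// checked for being non-negative.
#include "infiniop/handle.h"
#include <cstdio>

int main() {
    infiniopHandle_t handle = nullptr;
    if (infiniopCreateHandle(&handle, INFINI_DEVICE_CPU, 0) != INFINIOP_STATUS_SUCCESS) {
        std::printf("failed to create handle\n");
        return 1;
    }
    // ... create operator descriptors and run operators with this handle ...
    infiniopDestroyHandle(handle);
    return 0;
}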
#ifndef __POOL_H__
#define __POOL_H__
#include <atomic>
#include <mutex>
#include <optional>
#include <utility>
template<class T>
class Pool {
public:
Pool() : _head(nullptr) {}
Pool(const Pool &) = delete;
Pool(Pool &&pool) noexcept : _head(pool._head.exchange(nullptr)) {}
~Pool() {
while (this->pop()) {}
}
void push(T &&val) const {
Node<T> *new_node = new Node<T>(std::move(val));
new_node->next = _head.load();
while (!_head.compare_exchange_weak(new_node->next, new_node));
}
std::optional<T> pop() const {
Node<T> *top = _head.load();
Node<T> *new_head = nullptr;
do {
if (!top) {
return std::nullopt;
}
new_head = top->next;
} while (!_head.compare_exchange_weak(top, new_head));
return {std::move(top->data)};
}
private:
template<class U>
struct Node {
U data;
Node<U> *next;
Node(U &&data) : data(std::move(data)), next(nullptr) {}
};
mutable std::atomic<Node<T> *> _head;
};
#endif // __POOL_H__
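// --- usage sketch (not part of this commit): the lock-free Pool above used
// as a cache of reusable resources, assuming the header is available as
// "pool.h". pop() returns std::nullopt when the pool is empty, which is the
// caller's cue to create a fresh resource; the same pattern is used by
// use_cublas / use_cudnn with library handles.
#include "pool.h"
#include <cstdio>
#include <string>
#include <thread>
#include <vector>

int main() {
    Pool<std::string> pool;
    pool.push(std::string("buffer-0"));

    auto worker = [&pool](int id) {
        auto item = pool.pop();
        if (!item) {
            // Pool was empty: create a new resource instead.
            item = "buffer-" + std::to_string(id);
        }
        // ... use *item ...
        pool.push(std::move(*item)); // return it to the pool for reuse
    };

    std::vector<std::thread> threads;
    for (int i = 1; i <= 4; ++i) {
        threads.emplace_back(worker, i);
    }
    for (auto &t : threads) {
        t.join();
    }
    std::printf("done\n");
    return 0;
}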
#include "matmul_aclnn.h"
MatmulAclnnDescriptor::MatmulAclnnDescriptor(Device _device) {
device = _device;
device_id = 0;
executor = nullptr;
info = nullptr;
cDesc = new aclnnTensorDescriptor();
aDesc = new aclnnTensorDescriptor();
bDesc = new aclnnTensorDescriptor();
alpha = 1.0;
beta = 0;
mt = 1;
workspaceSize = 0;
}
infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta,
int8_t mt) {
DT dtype = c_desc->dt;
if (dtype != F16 && dtype != F32) {
return STATUS_BAD_TENSOR_DTYPE;
}
*desc_ptr = new MatmulAclnnDescriptor(handle->device);
(*desc_ptr)->device_id = handle->device_id;
(*desc_ptr)->dtype = dtype;
(*desc_ptr)->mt = mt;
(*desc_ptr)->alpha = alpha;
(*desc_ptr)->beta = beta;
infiniopStatus_t status = STATUS_EXECUTION_FAILED;
auto info = new MatmulInfo(c_desc, a_desc, b_desc, &status, false);
if (status != STATUS_SUCCESS) {
return status;
}
(*desc_ptr)->info = info;
auto &cDesc = (*desc_ptr)->cDesc;
auto &aDesc = (*desc_ptr)->aDesc;
auto &bDesc = (*desc_ptr)->bDesc;
// Treat A, B, C as 2D matrices; the same aclnnTensorDescriptors are reused across the batched operation
CHECK_STATUS(cDesc->setDescriptor(toAclDataType(c_desc->dt), {info->c_matrix.rows, info->c_matrix.cols}, {info->c_matrix.row_stride, info->c_matrix.col_stride}), STATUS_SUCCESS);
CHECK_STATUS(aDesc->setDescriptor(toAclDataType(a_desc->dt), {info->a_matrix.rows, info->a_matrix.cols}, {info->a_matrix.row_stride, info->a_matrix.col_stride}), STATUS_SUCCESS);
CHECK_STATUS(bDesc->setDescriptor(toAclDataType(b_desc->dt), {info->b_matrix.rows, info->b_matrix.cols}, {info->b_matrix.row_stride, info->b_matrix.col_stride}), STATUS_SUCCESS);
CHECK_STATUS(cDesc->createTensor(), STATUS_SUCCESS);
CHECK_STATUS(aDesc->createTensor(), STATUS_SUCCESS);
CHECK_STATUS(bDesc->createTensor(), STATUS_SUCCESS);
auto &workspaceSize = (*desc_ptr)->workspaceSize;
auto &executor = (*desc_ptr)->executor;
aclTensor *tc = cDesc->t;
aclTensor *ta = aDesc->t;
aclTensor *tb = bDesc->t;
aclnnStatus ret;
int64_t transA = 0;
int64_t transB = 0;
// aclnnGemm supports C = alpha * A @ B + beta * C
// see https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, (*desc_ptr)->alpha, (*desc_ptr)->beta, transA, transB, tc,
(*desc_ptr)->mt, &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return STATUS_EXECUTION_FAILED);
aclSetAclOpExecutorRepeatable(executor);
return STATUS_SUCCESS;
}
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
uint64_t *size) {
*size = desc->workspaceSize;
return STATUS_SUCCESS;
}
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
void *stream) {
auto &cDesc = desc->cDesc;
auto &aDesc = desc->aDesc;
auto &bDesc = desc->bDesc;
aclTensor *tc = cDesc->t;
aclTensor *ta = aDesc->t;
aclTensor *tb = bDesc->t;
auto batch = desc->info->batch;
auto &executor = desc->executor;
auto &workspaceSize = desc->workspaceSize;
// Set running on the handle's device
aclrtSetDevice(desc->device_id);
for (int i = 0; i < batch; i++) {
AclSetTensorAddr(executor, 0, ta, (char *) (a) + i * desc->info->a_matrix.stride * desc->dtype.size);
AclSetTensorAddr(executor, 1, tb, (char *) (b) + i * desc->info->b_matrix.stride * desc->dtype.size);
AclSetTensorAddr(executor, 2, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size);
AclSetTensorAddr(executor, 3, tc, (char *) (c) + i * desc->info->c_matrix.stride * desc->dtype.size);
aclnnStatus ret = aclnnGemm(workspace,
workspaceSize,
executor,
stream);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
return STATUS_EXECUTION_FAILED);
}
return STATUS_SUCCESS;
}
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) {
delete desc->cDesc;
delete desc->bDesc;
delete desc->aDesc;
delete desc->info;
aclDestroyAclOpExecutor(desc->executor);
delete desc;
return STATUS_SUCCESS;
}
#ifndef __ACLNN_MATMUL_H__
#define __ACLNN_MATMUL_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "../blas.h"
#include "operators.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <aclnnop/aclnn_matmul.h>
struct MatmulAclnnDescriptor {
Device device;
int device_id;
aclOpExecutor* executor;
MatmulInfo* info;
DT dtype;
aclnnTensorDescriptor_t cDesc, aDesc, bDesc;
// cubeMathType
// see doc: https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
float alpha;
float beta;
int8_t mt;
uint64_t workspaceSize;
MatmulAclnnDescriptor(Device _device);
};
typedef struct MatmulAclnnDescriptor *MatmulAclnnDescriptor_t;
infiniopStatus_t aclnnCreateMatmulDescriptor(AscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta,
int8_t cubeMathType);
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
uint64_t *size);
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream);
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc);
#endif
#include "matmul_cnnl.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h"
#include "../../utils.h"
#include "cnrt.h"
infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
MatmulBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta) {
infiniopStatus_t status = STATUS_EXECUTION_FAILED;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, false);
if (status != STATUS_SUCCESS) {
return status;
}
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
cnnlCreateTensorDescriptor(&aDesc);
cnnlCreateTensorDescriptor(&bDesc);
cnnlCreateTensorDescriptor(&cDesc);
setMatrixTensorEx(aDesc, info.a_matrix);
setMatrixTensorEx(bDesc, info.b_matrix);
setMatrixTensorEx(cDesc, info.c_matrix);
cnnlMatMulDescriptor_t opDesc;
cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult;
cnnlMatMulDescCreate(&opDesc);
cnnlMatMulAlgoCreate(&algo);
cnnlCreateMatMulHeuristicResult(&algoResult);
int32_t use_stride = true;
cnnlSetMatMulDescAttr(opDesc, CNNL_MATMUL_USE_STRIDE, &use_stride,
sizeof(int32_t));
*desc_ptr = new MatmulBangDescriptor{
handle->device,
handle->device_id,
info,
alpha,
beta,
c_desc->dt,
handle->cnnl_handles,
aDesc,
bDesc,
cDesc,
opDesc,
algo,
algoResult};
return STATUS_SUCCESS;
}
infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size) {
*size = 0;
return STATUS_SUCCESS;
}
infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc) {
desc->cnnl_handles = nullptr;
cnnlDestroyTensorDescriptor(desc->aDesc);
cnnlDestroyTensorDescriptor(desc->bDesc);
cnnlDestroyTensorDescriptor(desc->cDesc);
cnnlMatMulDescDestroy(desc->opDesc);
cnnlMatMulAlgoDestroy(desc->algo);
cnnlDestroyMatMulHeuristicResult(desc->algoResult);
delete desc;
return STATUS_SUCCESS;
}
void matmul_cnnl_f16(MatmulBangDescriptor_t desc, void *workspace, void *c, float beta, void const *a, void const *b, float alpha, void *stream) {
auto info = desc->info;
if (info.is_transed) {
std::swap(a, b);
}
use_cnnl(desc->cnnl_handles, desc->device_id, (cnrtQueue_t) stream,
[&](cnnlHandle_t handle) {
int count = 0;
cnnlGetBatchMatMulAlgoHeuristic(handle, desc->opDesc, desc->aDesc,
desc->bDesc, desc->cDesc,
NULL, 1, &desc->algoResult, &count);
size_t wsSize;
cnnlGetBatchMatMulHeuristicResult(desc->algoResult, desc->algo, &wsSize);
cnrtMalloc(&workspace, wsSize);
cnnlBatchMatMulBCast_v2(handle, desc->opDesc, desc->algo,
&alpha, desc->aDesc, a,
desc->bDesc, b,
&beta, desc->cDesc, c,
workspace, wsSize);
});
}
infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream) {
if (cnrtSetDevice(desc->device_id) != cnrtSuccess) {
return STATUS_BAD_DEVICE;
}
float alpha = desc->alpha;
float beta = desc->beta;
if (dtype_eq(desc->dtype, F16)) {
matmul_cnnl_f16(desc, workspace, c, beta, a, b, alpha, stream);
cnrtQueueSync((cnrtQueue_t)stream);
return STATUS_SUCCESS;
}
return STATUS_BAD_TENSOR_DTYPE;
}
#ifndef __CNNL_MATMUL_H__
#define __CNNL_MATMUL_H__
#include "../../../devices/bang/bang_handle.h"
#include "../blas.h"
#include "cnnl.h"
#include "cnnl_extra.h"
#include "operators.h"
struct MatmulBangDescriptor {
Device device;
int device_id;
MatmulInfo info;
float alpha;
float beta;
DT dtype;
std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handles;
cnnlTensorDescriptor_t aDesc;
cnnlTensorDescriptor_t bDesc;
cnnlTensorDescriptor_t cDesc;
cnnlMatMulDescriptor_t opDesc;
cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult;
};
typedef struct MatmulBangDescriptor *MatmulBangDescriptor_t;
infiniopStatus_t bangCreateMatmulDescriptor(BangHandle_t handle,
MatmulBangDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
float alpha,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
float beta);
infiniopStatus_t bangGetMatmulWorkspaceSize(MatmulBangDescriptor_t desc, uint64_t *size);
infiniopStatus_t bangMatmul(MatmulBangDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, void *stream);
infiniopStatus_t bangDestroyMatmulDescriptor(MatmulBangDescriptor_t desc);
inline void setMatrixTensorEx(cnnlTensorDescriptor_t desc, const BlasMatrix &matrix, bool trans = false) {
int ndim = matrix.ndim;
int batch = matrix.batch;
int stride = static_cast<int>(matrix.stride);
int rows = matrix.rows;
int cols = matrix.cols;
int row_stride = matrix.row_stride;
int col_stride = matrix.col_stride;
if (ndim == 3) {
std::vector<int> dim_size = {batch, rows, cols};
std::vector<int> dim_stride = {stride, row_stride, col_stride};
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
dim_size.size(), dim_size.data(), dim_stride.data());
} else if (ndim == 2) {
std::vector<int> dim_size = {rows, cols};
std::vector<int> dim_stride = {row_stride, col_stride};
cnnlSetTensorDescriptorEx(desc, CNNL_LAYOUT_ARRAY, CNNL_DTYPE_HALF,
dim_size.size(), dim_size.data(), dim_stride.data());
}
}
#endif// __CNNL_MATMUL_H__
#ifndef __BLAS_H__
#define __BLAS_H__
#include "../utils.h"
#include "infiniop/operator.h"
#include <algorithm>
#include <stdint.h>
typedef struct BlasMatrix {
size_t ndim;
size_t batch;
int64_t stride;
size_t rows;
size_t cols;
int64_t row_stride;
int64_t col_stride;
BlasMatrix() {}
BlasMatrix(infiniopTensorDescriptor_t layout, infiniopStatus_t *status) {
if (layout->ndim == 2) {
this->ndim = 2;
this->batch = 1;
this->stride = 0;
this->rows = layout->shape[0];
this->cols = layout->shape[1];
this->row_stride = layout->strides[0];
this->col_stride = layout->strides[1];
} else if (layout->ndim == 3) {
this->ndim = 3;
this->batch = layout->shape[0];
this->stride = this->batch == 1 ? 0 : layout->strides[0];
this->rows = layout->shape[1];
this->cols = layout->shape[2];
this->row_stride = layout->strides[1];
this->col_stride = layout->strides[2];
} else {
*status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
return;
}
if (this->row_stride != 1 && this->col_stride != 1) {
*status = INFINIOP_STATUS_BAD_TENSOR_STRIDES;
return;
}
*status = INFINIOP_STATUS_SUCCESS;
}
bool match_batch(size_t batch) const {
return this->batch == batch || this->batch == 1;
}
void transpose() {
std::swap(rows, cols);
std::swap(row_stride, col_stride);
}
int ld() const {
if (this->row_stride == 1) {
return this->col_stride;
} else {
return this->row_stride;
}
}
} BlasMatrix;
struct MatmulInfo {
BlasMatrix a_matrix;
BlasMatrix b_matrix;
BlasMatrix c_matrix;
size_t m, n, k, batch;
bool is_transed = false;
MatmulInfo(infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc, infiniopStatus_t *status, bool col_major = true) {
a_matrix = BlasMatrix(a_desc, status);
if (*status != INFINIOP_STATUS_SUCCESS) {
return;
}
b_matrix = BlasMatrix(b_desc, status);
if (*status != INFINIOP_STATUS_SUCCESS) {
return;
}
c_matrix = BlasMatrix(c_desc, status);
if (*status != INFINIOP_STATUS_SUCCESS) {
return;
}
if (c_matrix.rows != a_matrix.rows || c_matrix.cols != b_matrix.cols || a_matrix.cols != b_matrix.rows){
*status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
return;
}
batch = c_matrix.batch;
if (!a_matrix.match_batch(batch) || !b_matrix.match_batch(batch)) {
*status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
return;
}
if ((col_major && c_matrix.col_stride == 1) || (!col_major && c_matrix.row_stride == 1)) {
c_matrix.transpose();
b_matrix.transpose();
a_matrix.transpose();
std::swap(a_matrix, b_matrix);
is_transed = true;
}
m = c_matrix.rows;
n = c_matrix.cols;
k = a_matrix.cols;
}
};
#endif// __BLAS_H__
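// --- usage sketch (not part of this commit): how MatmulInfo normalizes
// row-major operands for a column-major BLAS (col_major = true, the default).
// For a contiguous row-major C the constructor transposes A, B and C and
// swaps A with B, so the backend effectively computes C^T = B^T * A^T into
// the same buffer; is_transed tells the caller to swap the a/b data pointers
// as well. Assumes the header above is reachable as "blas.h".
#include "blas.h"
#include "infiniop/tensor_descriptor.h"
#include <cstdio>

int main() {
    // C[4x5] = A[4x3] * B[3x5], all contiguous row-major (default strides).
    size_t a_shape[2] = {4, 3}, b_shape[2] = {3, 5}, c_shape[2] = {4, 5};
    infiniopTensorDescriptor_t a, b, c;
    infiniopCreateTensorDescriptor(&a, 2, a_shape, nullptr, INFINI_DTYPE_F32);
    infiniopCreateTensorDescriptor(&b, 2, b_shape, nullptr, INFINI_DTYPE_F32);
    infiniopCreateTensorDescriptor(&c, 2, c_shape, nullptr, INFINI_DTYPE_F32);

    infiniopStatus_t status;
    MatmulInfo info(c, a, b, &status);
    if (status == INFINIOP_STATUS_SUCCESS) {
        // With row-major inputs: is_transed == true, m == 5, n == 4, k == 3.
        std::printf("m=%zu n=%zu k=%zu transed=%d\n",
                    info.m, info.n, info.k, (int) info.is_transed);
    }

    infiniopDestroyTensorDescriptor(a);
    infiniopDestroyTensorDescriptor(b);
    infiniopDestroyTensorDescriptor(c);
    return 0;
}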
#include "./matmul_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../utils.h"
#include <cmath>
infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
MatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
*desc_ptr = new MatmulCpuDescriptor{
INFINI_DEVICE_CPU,
dtype,
info};
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size) {
*size = 0;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc) {
delete desc;
return INFINIOP_STATUS_SUCCESS;
}
template<typename Tdata>
infiniopStatus_t matmul_cpu(MatmulCpuDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha) {
auto info = desc->info;
if (info.is_transed) {
std::swap(a, b);
}
for (int i = 0; i < info.batch; ++i) {
for (int m_ = 0; m_ < info.m; ++m_) {
for (int n_ = 0; n_ < info.n; ++n_) {
auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride;
float sum = 0;
for (int k_ = 0; k_ < info.k; ++k_) {
auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride;
if constexpr (std::is_same<Tdata, uint16_t>::value) {
sum += f16_to_f32(*a_) * f16_to_f32(*b_);
} else {
sum += *a_ * (*b_);
}
}
if constexpr (std::is_same<Tdata, uint16_t>::value) {
if (beta == 0) {
*c_ = f32_to_f16(alpha * sum);
} else {
*c_ = f32_to_f16(beta * f16_to_f32(*c_) + alpha * sum);
}
} else {
*c_ = beta * (*c_) + alpha * sum;
}
}
}
}
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta) {
if (desc->dtype == INFINI_DTYPE_F16) {
return matmul_cpu<uint16_t>(desc, c, beta, a, b, alpha);
}
if (desc->dtype == INFINI_DTYPE_F32) {
return matmul_cpu<float>(desc, c, beta, a, b, alpha);
}
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
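// --- usage sketch (not part of this commit): why the F16 path above stores
// elements as uint16_t. Half-precision values are kept as raw 16-bit patterns
// and converted with the f16_to_f32 / f32_to_f16 helpers so the accumulation
// runs in float, exactly as matmul_cpu<uint16_t> does. Assumes common_cpu.h
// (included by matmul_cpu.cc above) is on the include path; the helper
// signatures are inferred from their use in that file.
#include "common_cpu.h"
#include <cstdint>
#include <cstdio>

int main() {
    float a = 1.5f, b = -2.25f;
    uint16_t ha = f32_to_f16(a), hb = f32_to_f16(b);
    // Accumulate in f32, then round the result back to f16 storage.
    float sum = f16_to_f32(ha) * f16_to_f32(hb);
    uint16_t hc = f32_to_f16(sum);
    std::printf("%f stored as 0x%04x\n", f16_to_f32(hc), (unsigned) hc); // -3.375
    return 0;
}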
#ifndef __INFINIOP_MATMUL_CPU_H__
#define __INFINIOP_MATMUL_CPU_H__
#include "../../../devices/cpu/cpu_handle.h"
#include "../blas.h"
#include "infiniop/operator.h"
typedef struct MatmulCpuDescriptor {
infiniDevice_t device;
infiniDtype_t dtype;
MatmulInfo info;
} MatmulCpuDescriptor;
typedef struct MatmulCpuDescriptor *MatmulCpuDescriptor_t;
infiniopStatus_t cpuCreateMatmulDescriptor(infiniopCpuHandle_t handle,
MatmulCpuDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
infiniopStatus_t cpuGetMatmulWorkspaceSize(MatmulCpuDescriptor_t desc, uint64_t *size);
infiniopStatus_t cpuMatmul(MatmulCpuDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta);
infiniopStatus_t cpuDestroyMatmulDescriptor(MatmulCpuDescriptor_t desc);
#endif// __INFINIOP_MATMUL_CPU_H__
#include "./matmul_cuda.cuh"
#include "../../utils.h"
infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
infiniopMatmulCudaDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
*desc_ptr = new InfiniopMatmulCudaDescriptor{
handle->device,
dtype,
handle->device_id,
info,
handle->cublas_handles_t};
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc, uint64_t *size) {
*size = 0;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc) {
desc->cublas_handles_t = nullptr;
delete desc;
return INFINIOP_STATUS_SUCCESS;
}
#ifndef __INFINIOP_MATMUL_CUDA_H__
#define __INFINIOP_MATMUL_CUDA_H__
#include "matmul_cuda_api.h"
#include "../../../devices/cuda/common_cuda.cuh"
#include <memory>
#include "../blas.h"
typedef struct InfiniopMatmulCudaDescriptor {
infiniDevice_t device;
infiniDtype_t dtype;
int device_id;
MatmulInfo info;
std::shared_ptr<Pool<cublasHandle_t>> cublas_handles_t;
} InfiniopMatmulCudaDescriptor;
#endif// __INFINIOP_MATMUL_CUDA_H__
#ifndef __INFINIOP_MATMUL_CUDA_API_H__
#define __INFINIOP_MATMUL_CUDA_API_H__
#include "../../../devices/cuda/cuda_handle.h"
#include "infiniop/operator.h"
struct InfiniopMatmulCudaDescriptor;
typedef struct InfiniopMatmulCudaDescriptor *infiniopMatmulCudaDescriptor_t;
infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
infiniopMatmulCudaDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc);
infiniopStatus_t cudaGetMatmulWorkspaceSize(infiniopMatmulCudaDescriptor_t desc, uint64_t *size);
infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta,
void *stream);
infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc);
#endif // __INFINIOP_MATMUL_CUDA_API_H__
#include "../../utils.h"
#include "./matmul_cuda.cuh"
template<typename Tdata>
infiniopStatus_t matmul_cuda(infiniopMatmulCudaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) {
auto info = desc->info;
if (info.is_transed) {
std::swap(a, b);
}
cudaDataType a_type, b_type, c_type;
cublasComputeType_t compute_type;
if constexpr (std::is_same<Tdata, half>::value) {
a_type = b_type = c_type = CUDA_R_16F;
compute_type = CUBLAS_COMPUTE_32F;
} else {
a_type = b_type = c_type = CUDA_R_32F;
#ifdef ENABLE_SUGON_CUDA_API
compute_type = CUBLAS_COMPUTE_32F;
#else
compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
#endif
}
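// cuBLAS assumes column-major storage. MatmulInfo (constructed with the
// default col_major = true) has already transposed A, B and C and swapped
// A with B when C was row-major, so the C buffer can be written as-is.
// Each operand is then passed as OP_N when its row stride is 1 (already
// column-major in memory) and as OP_T otherwise; ld() returns the matching
// leading dimension.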
auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
use_cublas(desc->cublas_handles_t, desc->device_id, (cudaStream_t) stream,
[&](cublasHandle_t handle) { cublasGemmStridedBatchedEx(
handle,
op_a,
op_b,
info.m,
info.n,
info.k,
&alpha,
a,
a_type,
info.a_matrix.ld(),
info.a_matrix.stride,
b,
b_type,
info.b_matrix.ld(),
info.b_matrix.stride,
&beta,
c,
c_type,
info.c_matrix.ld(),
info.c_matrix.stride,
info.batch,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); });
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
void *workspace,
uint64_t workspace_size,
void *c,
void const *a,
void const *b,
float alpha,
float beta,
void *stream) {
if (desc->dtype == INFINI_DTYPE_F16) {
return matmul_cuda<half>(desc, c, beta, a, b, alpha, stream);
}
if (desc->dtype == INFINI_DTYPE_F32) {
return matmul_cuda<float>(desc, c, beta, a, b, alpha, stream);
}
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
#include "../utils.h"
#include "infiniop/ops/matmul.h"
#ifdef ENABLE_CPU_API
#include "cpu/matmul_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/matmul_cuda_api.h"
#endif
#ifdef ENABLE_CAMBRICON_MLU
#include "bang/matmul_cnnl.h"
#endif
#ifdef ENABLE_ASCEND_NPU
#include "ascend/matmul_aclnn.h"
#endif
__C infiniopStatus_t infiniopCreateMatmulDescriptor(infiniopHandle_t handle,
infiniopMatmulDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
switch (handle->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuCreateMatmulDescriptor((infiniopCpuHandle_t) handle, (MatmulCpuDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return cudaCreateMatmulDescriptor((infiniopCudaHandle_t) handle, (infiniopMatmulCudaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateMatmulDescriptor((BangHandle_t) handle, (MatmulBangDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
// alpha and beta are baked into the Ascend descriptor at creation time;
// use the defaults alpha = 1, beta = 0 here.
return aclnnCreateMatmulDescriptor((AscendHandle_t) handle,
(MatmulAclnnDescriptor_t *) desc_ptr,
c_desc,
1.0f,
a_desc,
b_desc,
0.0f,
1);
}
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
}
__C infiniopStatus_t infiniopGetMatmulWorkspaceSize(infiniopMatmulDescriptor_t desc, uint64_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuGetMatmulWorkspaceSize((MatmulCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return cudaGetMatmulWorkspaceSize((infiniopMatmulCudaDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetMatmulWorkspaceSize((MatmulBangDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnGetMatmulWorkspaceSize((MatmulAclnnDescriptor_t) desc,
size);
}
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
}
__C infiniopStatus_t infiniopMatmul(infiniopMatmulDescriptor_t desc, void *workspace, uint64_t workspace_size, void *c, void const *a, void const *b, float alpha, float beta, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuMatmul((MatmulCpuDescriptor_t) desc, workspace, workspace_size, c, a, b, alpha, beta);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA:
return cudaMatmul((infiniopMatmulCudaDescriptor_t) desc, workspace, workspace_size, c, a, b, alpha, beta, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangMatmul((MatmulBangDescriptor_t) desc, workspace, workspace_size, c, a, b, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu:
return aclnnMatmul((MatmulAclnnDescriptor_t) desc,
workspace,
workspace_size,
c,
a,
b,
stream);
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
}
__C infiniopStatus_t infiniopDestroyMatmulDescriptor(infiniopMatmulDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU_API
case INFINI_DEVICE_CPU:
return cpuDestroyMatmulDescriptor((MatmulCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_CUDA_API
case INFINI_DEVICE_NVIDIA: {
return cudaDestroyMatmulDescriptor((infiniopMatmulCudaDescriptor_t) desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyMatmulDescriptor((MatmulBangDescriptor_t) desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnDestroyMatmulDescriptor((MatmulAclnnDescriptor_t) desc);
}
#endif
}
return INFINIOP_STATUS_BAD_DEVICE;
}
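// --- usage sketch (not part of this commit): the full public matmul flow on
// the CPU backend. C[2x2] = 1.0 * A[2x3] @ B[3x2] + 0.0 * C, all contiguous
// row-major f32. Header paths follow the includes used in this commit.
#include "infiniop/handle.h"
#include "infiniop/ops/matmul.h"
#include "infiniop/tensor_descriptor.h"
#include <cstdio>

int main() {
    infiniopHandle_t handle;
    infiniopCreateHandle(&handle, INFINI_DEVICE_CPU, 0);

    size_t a_shape[2] = {2, 3}, b_shape[2] = {3, 2}, c_shape[2] = {2, 2};
    infiniopTensorDescriptor_t a_desc, b_desc, c_desc;
    infiniopCreateTensorDescriptor(&a_desc, 2, a_shape, nullptr, INFINI_DTYPE_F32);
    infiniopCreateTensorDescriptor(&b_desc, 2, b_shape, nullptr, INFINI_DTYPE_F32);
    infiniopCreateTensorDescriptor(&c_desc, 2, c_shape, nullptr, INFINI_DTYPE_F32);

    infiniopMatmulDescriptor_t matmul_desc;
    infiniopCreateMatmulDescriptor(handle, &matmul_desc, c_desc, a_desc, b_desc);

    float a[6] = {1, 2, 3, 4, 5, 6};
    float b[6] = {1, 0, 0, 1, 1, 1};
    float c[4] = {0, 0, 0, 0};
    // No workspace is needed on the CPU path (cpuGetMatmulWorkspaceSize returns 0).
    infiniopMatmul(matmul_desc, nullptr, 0, c, a, b, 1.0f, 0.0f, nullptr);
    std::printf("%.0f %.0f %.0f %.0f\n", c[0], c[1], c[2], c[3]); // 4 5 10 11

    infiniopDestroyMatmulDescriptor(matmul_desc);
    infiniopDestroyTensorDescriptor(a_desc);
    infiniopDestroyTensorDescriptor(b_desc);
    infiniopDestroyTensorDescriptor(c_desc);
    infiniopDestroyHandle(handle);
    return 0;
}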
#ifndef __UTILS_H__
#define __UTILS_H__
#include "infiniop/tensor_descriptor.h"
#include <algorithm>
#include <iostream>
#include <numeric>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
/* This file contains some useful macros and helper functions */
#define ROUND_UP_DIV(x, y) (((x) + (y) - 1) / (y))
#define CHECK_ERROR(call, target, errCode) \
do { \
if (auto value = (call); value == (target)) { \
std::cerr << "Error: got error value " << value \
<< " in file " << __FILE__ \
<< ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return (errCode); \
} \
} while (0)
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr; \
CHECK_ERROR(value, target, errCode)
#define CHECK_STATUS(call, target) \
do { \
if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) \
<< " but got " << value \
<< " in file " << __FILE__ \
<< ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return value; \
} \
} while (0)
inline std::vector<int64_t> get_byte_strides(infiniopTensorDescriptor_t desc) {
std::vector<int64_t> strides(desc->ndim);
for (uint64_t i = 0; i < desc->ndim; i++) {
strides[i] = desc->strides[i] * infini_sizeof(desc->dtype);
}
return strides;
}
// calculate the broadcasted shape for two tensors
inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
const uint64_t *shape2, uint64_t ndim2,
uint64_t *broadcast_shape, uint64_t *padded_shape1,
uint64_t *padded_shape2, uint64_t max_rank) {
// prepending and initializing
std::fill(padded_shape1, padded_shape1 + max_rank, 1);
std::fill(padded_shape2, padded_shape2 + max_rank, 1);
std::copy(shape1, shape1 + ndim1, padded_shape1 + max_rank - ndim1);
std::copy(shape2, shape2 + ndim2, padded_shape2 + max_rank - ndim2);
// compute broadcasted shape
for (size_t i = 0; i < max_rank; ++i) {
if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) {
broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
} else {
return false;
}
}
return true;
}
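// Example: shape1 = {2, 1, 3} (ndim1 = 3), shape2 = {4, 1} (ndim2 = 2),
// max_rank = 3. shape2 is right-aligned and padded to {1, 4, 1}, and the
// result is broadcast_shape = {2, 4, 3}. A pair such as {2, 3} and {4, 3}
// is rejected (returns false) because 2 and 4 differ and neither is 1.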
// check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c,
uint64_t broadcast_ndim) {
std::vector<uint64_t>
broadcast_shape_(broadcast_ndim),
padded_shape1_(broadcast_ndim),
padded_shape2_(broadcast_ndim);
auto broadcast_shape = broadcast_shape_.data(),
padded_shape1 = padded_shape1_.data(),
padded_shape2 = padded_shape2_.data();
if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) {
return false;
}
return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim, c->shape);
}
// check if the shape of tensor src can be validly broadcasted to that of the tensor dst
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) {
if (dst->ndim < src->ndim) {
return false;
}
std::vector<size_t> padded_shape_(dst->ndim);
auto padded_shape = padded_shape_.data();
std::fill(padded_shape, padded_shape + dst->ndim, 1);
std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim);
for (size_t i = 0; i < dst->ndim; ++i) {
if (padded_shape[i] != dst->shape[i] && padded_shape[i] != 1) {
return false;
}
}
return true;
}
// check if the shape of tensor c is valid after broadcasting tensors a and b
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c) {
return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
}
inline size_t get_byte_size(infiniopTensorDescriptor_t desc) {
size_t size = 1;
for (size_t i = 0; i < desc->ndim; i++) {
size *= desc->shape[i];
}
return size * infini_sizeof(desc->dtype);
}
// permute the dimensions of a tensor descriptor
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const std::vector<size_t> &order) {
size_t ndim = desc->ndim;
if (order.size() != ndim) {
return nullptr;
}
size_t *shape = new size_t[ndim];
int64_t *strides = new int64_t[ndim];
for (size_t i = 0; i < ndim; i++) {
if (std::find(order.begin(), order.end(), i) == order.end()) {
// invalid permutation: free the partially built arrays before bailing out
delete[] shape;
delete[] strides;
return nullptr;
}
shape[i] = desc->shape[order[i]];
strides[i] = desc->strides[order[i]];
}
return new InfiniopTensorDescriptor{
desc->dtype, ndim, shape, strides};
}
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are contiguous
inline bool is_contiguous(const infiniopTensorDescriptor_t &desc, size_t dim_start, size_t dim_end) {
for (size_t i = dim_start + 1; i <= dim_end; i++) {
if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
return false;
}
}
return true;
}
inline bool is_contiguous(const infiniopTensorDescriptor_t &desc) {
if (desc->ndim == 0) {
return true;
}
return is_contiguous(desc, 0, desc->ndim - 1);
}
// merge the dimensions [dim_start, dim_end] of a tensor descriptor
inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, size_t dim_start, size_t dim_end) {
size_t ndim = desc->ndim;
if (dim_start > dim_end || dim_end >= ndim) {
return nullptr;
}
// the merged dimensions must be contiguous in memory
if (!is_contiguous(desc, dim_start, dim_end)) {
return nullptr;
}
size_t new_ndim = ndim - (dim_end - dim_start);
size_t *new_shape = new size_t[new_ndim];
int64_t *new_strides = new int64_t[new_ndim];
size_t index = 0;
for (size_t i = 0; i < dim_start; i++) {
new_shape[index] = desc->shape[i];
new_strides[index] = desc->strides[i];
index++;
}
new_shape[index] = 1;
for (size_t i = dim_start; i <= dim_end; i++) {
new_shape[index] *= desc->shape[i];
}
new_strides[index] = desc->strides[dim_end];
index++;
for (size_t i = dim_end + 1; i < ndim; i++) {
new_shape[index] = desc->shape[i];
new_strides[index] = desc->strides[i];
index++;
}
return new InfiniopTensorDescriptor{
desc->dtype, new_ndim, new_shape, new_strides};
}
// split the dimension dim of a tensor descriptor into multiple dimensions
inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, size_t dim, const std::vector<size_t> &dims) {
size_t ndim = desc->ndim;
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
return nullptr;
}
size_t new_ndim = ndim + dims.size() - 1;
size_t *new_shape = new size_t[new_ndim];
int64_t *new_strides = new int64_t[new_ndim];
size_t index = 0;
for (size_t i = 0; i < dim; i++) {
new_shape[index] = desc->shape[i];
new_strides[index] = desc->strides[i];
index++;
}
for (size_t i = 0; i < dims.size(); i++) {
new_shape[index] = dims[i];
new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t) 1, std::multiplies<size_t>());
index++;
}
for (size_t i = dim + 1; i < ndim; i++) {
new_shape[index] = desc->shape[i];
new_strides[index] = desc->strides[i];
index++;
}
return new InfiniopTensorDescriptor{
desc->dtype, new_ndim, new_shape, new_strides};
}
#endif// __UTILS_H__
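// --- usage sketch (not part of this commit): the descriptor helpers above on
// a contiguous {2, 3, 4} f32 tensor. permute reorders shape/strides without
// touching data, dim_merge collapses contiguous axes, and dim_split expands
// one axis; each returns nullptr when the request is invalid. Assumes the
// header above is reachable as "utils.h".
#include "utils.h"
#include <cstdio>

int main() {
    size_t shape[3] = {2, 3, 4};
    infiniopTensorDescriptor_t desc;
    infiniopCreateTensorDescriptor(&desc, 3, shape, nullptr, INFINI_DTYPE_F32);

    auto permuted = permute(desc, {2, 0, 1}); // shape {4, 2, 3}, strides {1, 12, 4}
    auto merged = dim_merge(desc, 1, 2);      // shape {2, 12}, strides {12, 1}
    auto split = dim_split(desc, 2, {2, 2});  // shape {2, 3, 2, 2}
    std::printf("%d %d %d\n", permuted != nullptr, merged != nullptr, split != nullptr);

    infiniopDestroyTensorDescriptor(desc);
    return 0;
}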
#include "infiniop/tensor_descriptor.h"
#include <cstring>
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape_, int64_t const *strides_, infiniDtype_t datatype) {
size_t *shape = new size_t[ndim];
int64_t *strides = new int64_t[ndim];
std::memcpy(shape, shape_, ndim * sizeof(size_t));
if (strides_) {
std::memcpy(strides, strides_, ndim * sizeof(int64_t));
} else {
int64_t dsize = 1;
for (size_t i = ndim; i-- > 0;) {
strides[i] = dsize;
dsize *= shape[i];
}
}
*desc_ptr = new InfiniopTensorDescriptor{datatype, ndim, shape, strides};
return INFINIOP_STATUS_SUCCESS;
}
__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc) {
delete[] desc->shape;
delete[] desc->strides;
delete desc;
return INFINIOP_STATUS_SUCCESS;
}
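// --- usage sketch (not part of this commit): passing strides_ == nullptr to
// infiniopCreateTensorDescriptor produces contiguous row-major strides,
// e.g. shape {2, 3, 4} -> strides {12, 4, 1}.
#include "infiniop/tensor_descriptor.h"
#include <cstdio>

int main() {
    size_t shape[3] = {2, 3, 4};
    infiniopTensorDescriptor_t desc;
    infiniopCreateTensorDescriptor(&desc, 3, shape, nullptr, INFINI_DTYPE_F32);
    for (size_t i = 0; i < desc->ndim; i++) {
        std::printf("%lld ", (long long) desc->strides[i]); // 12 4 1
    }
    std::printf("\n");
    infiniopDestroyTensorDescriptor(desc);
    return 0;
}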