Unverified commit e77735ef authored by PanZezhong1725, committed by GitHub

Merge pull request #65 from YdrMaster/main

issue/63 Refactor the operator definition file structure and adjust code style
parents b7893d65 3144cc9c
......@@ -2,7 +2,7 @@
#define __INFINIOP_HANDLE__
#include "../infinicore.h"
#include "./status.h"
#include "status.h"
typedef struct InfiniopHandle {
infiniDevice_t device;
......
#ifndef __INFINIOP_OPERATOR___
#define __INFINIOP_OPERATOR___
#include "./handle.h"
#include "./tensor_descriptor.h"
#include "handle.h"
#include "tensor_descriptor.h"
// Base descriptor for all operators
typedef struct InfiniopDescriptor {
infiniDevice_t device;
infiniDevice_t device_type;
int device_id;
} InfiniopDescriptor;
__C __export infiniopStatus_t infiniopGetDescriptorDeviceType(const InfiniopDescriptor *desc_ptr, infiniDevice_t *device_type);
__C __export infiniopStatus_t infiniopGetDescriptorDeviceId(const InfiniopDescriptor *desc_ptr, int *device_id);
#endif //__INFINIOP_OPERATOR___
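For orientation, a minimal caller-side sketch of the two new getters (the include path matches how other files in this diff include the header; the descriptor is assumed to come from some infiniopCreate*Descriptor call):

#include "infiniop/operator.h"
#include <cstdio>

// Query and print where a descriptor was created (sketch only).
void printDescriptorDevice(const InfiniopDescriptor *desc) {
    infiniDevice_t device_type;
    int device_id;
    if (infiniopGetDescriptorDeviceType(desc, &device_type) == INFINIOP_STATUS_SUCCESS
        && infiniopGetDescriptorDeviceId(desc, &device_id) == INFINIOP_STATUS_SUCCESS) {
        std::printf("device_type=%d device_id=%d\n", (int)device_type, device_id);
    }
}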
......@@ -2,8 +2,8 @@
#define __INFINIOP_ATTENTION_H__
#include "../operator.h"
#include "./matmul.h"
#include "./swiglu.h"
#include "matmul.h"
#include "swiglu.h"
typedef InfiniopDescriptor *infiniopAttentionDescriptor_t;
......@@ -23,9 +23,9 @@ __C __export infiniopStatus_t infiniopAttention(infiniopAttentionDescriptor_t de
void *workspace,
size_t workspace_size,
void *out,
void const *q,
void const *k,
void const *v,
const void *q,
const void *k,
const void *v,
void *k_cache,
void *v_cache,
void *stream);
......
......@@ -2,8 +2,8 @@
#define __INFINIOP_MLP_H__
#include "../operator.h"
#include "./matmul.h"
#include "./swiglu.h"
#include "matmul.h"
#include "swiglu.h"
typedef InfiniopDescriptor *infiniopMLPDescriptor_t;
......@@ -22,9 +22,9 @@ __C __export infiniopStatus_t infiniopMLP(infiniopMLPDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
void const *x,
void const *w12,
void const *w3,
const void *x,
const void *w12,
const void *w3,
void *stream);
__C __export infiniopStatus_t infiniopDestroyMLPDescriptor(infiniopMLPDescriptor_t desc);
......
......@@ -2,7 +2,7 @@
#define __INFINIOP_TENSOR_DESCRIPTOR__
#include "../infinicore.h"
#include "./status.h"
#include "status.h"
struct InfiniopTensorDescriptor {
// Datatype
......@@ -17,7 +17,7 @@ struct InfiniopTensorDescriptor {
typedef struct InfiniopTensorDescriptor *infiniopTensorDescriptor_t;
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, size_t const *shape, ptrdiff_t const *strides, infiniDtype_t dtype);
__C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescriptor_t *desc_ptr, size_t ndim, const size_t *shape, const ptrdiff_t *strides, infiniDtype_t dtype);
__C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);
......
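A minimal creation/destruction sketch against this API (the row-major strides and F32 dtype are chosen for illustration; error handling elided):

size_t shape[2] = {2, 3};
ptrdiff_t strides[2] = {3, 1}; // contiguous row-major, strides in elements
infiniopTensorDescriptor_t desc = nullptr;
if (infiniopCreateTensorDescriptor(&desc, 2, shape, strides, INFINI_DTYPE_F32)
    == INFINIOP_STATUS_SUCCESS) {
    // ... pass desc to an operator's create function ...
    infiniopDestroyTensorDescriptor(desc);
}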
......@@ -91,7 +91,7 @@ def git_added_files():
try:
# Use "git diff --cached --name-only" to list all files staged in the index
result = subprocess.run(
["git", "diff", "--cached", "--name-only"],
["git", "diff", "--cached", "--diff-filter=AMR", "--name-only"],
capture_output=True,
text=True,
check=True,
......@@ -162,7 +162,7 @@ def main():
if args.ref is None and args.path is None:
# Last commit.
print("{Fore.GREEN}Formating git added files.{Style.RESET_ALL}")
print(f"{Fore.GREEN}Formating git added files.{Style.RESET_ALL}")
files = git_added_files()
else:
......
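Note: --diff-filter=AMR restricts the listing to Added, Modified, and Renamed paths, so files deleted in the staging area are no longer handed to the formatter, which would otherwise try to open paths that no longer exist.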
#ifndef __ACLNN_TENSOR__
#define __ACLNN_TENSOR__
#include "./common_ascend.h"
#include "common_ascend.h"
#include "infiniop/operator.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
......
#include "./common_cpu.h"
#include "common_cpu.h"
float f16_to_f32(uint16_t h) {
uint32_t sign = (h & 0x8000) << 16;
......@@ -59,9 +59,11 @@ uint16_t f32_to_f16(float val) {
}
}
size_t indexToReducedOffset(size_t flat_index, size_t ndim,
ptrdiff_t const *broadcasted_strides,
ptrdiff_t const *target_strides) {
size_t indexToReducedOffset(
size_t flat_index,
size_t ndim,
const ptrdiff_t *broadcasted_strides,
const ptrdiff_t *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
......@@ -70,8 +72,11 @@ size_t indexToReducedOffset(size_t flat_index, size_t ndim,
return res;
}
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape,
ptrdiff_t const *strides) {
size_t indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
......@@ -80,7 +85,10 @@ size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape,
return res;
}
size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
size_t getPaddedSize(
size_t ndim,
size_t *shape,
const size_t *pads) {
size_t total_size = 1;
for (size_t i = 0; i < ndim; ++i) {
total_size *= shape[i] + (i < 2 ? 0 : 2 * pads[i - 2]);
......@@ -88,8 +96,10 @@ size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads) {
return total_size;
}
std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape,
size_t const *pads) {
std::vector<size_t> getPaddedShape(
size_t ndim,
const size_t *shape,
const size_t *pads) {
std::vector<size_t> padded_shape(ndim);
memcpy(padded_shape.data(), shape, ndim * sizeof(size_t));
for (size_t i = 2; i < ndim; ++i) {
......
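To make the index arithmetic concrete, a small worked example (shapes and strides chosen for illustration):

// indexToOffset: shape {2, 3}, flat index 4 decomposes to coordinates (1, 1).
// With row-major strides {3, 1} the offset is 1*3 + 1*1 = 4; with
// column-major strides {1, 2} it is 1*1 + 1*2 = 3.
size_t shape[2] = {2, 3};
ptrdiff_t col_major[2] = {1, 2};
size_t off = indexToOffset(4, 2, shape, col_major); // off == 3

// indexToReducedOffset: a {1, 3} tensor broadcast to {2, 3} has target
// strides {0, 1} against contiguous broadcasted strides {3, 1}; flat index 4
// of the broadcast view maps back to offset 4/3*0 + (4%3)*1 = 1.
ptrdiff_t broadcasted[2] = {3, 1}, target[2] = {0, 1};
size_t red = indexToReducedOffset(4, 2, broadcasted, target); // red == 1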
#ifndef __INFINIOP__COMMON_CPU_H__
#define __INFINIOP__COMMON_CPU_H__
#ifndef __INFINIOP_COMMON_CPU_H__
#define __INFINIOP_COMMON_CPU_H__
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
......@@ -13,18 +14,18 @@ float f16_to_f32(uint16_t code);
uint16_t f32_to_f16(float val);
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
size_t indexToReducedOffset(size_t flat_index, size_t ndim, ptrdiff_t const *broadcasted_strides, ptrdiff_t const *target_strides);
size_t indexToReducedOffset(size_t flat_index, size_t ndim, const ptrdiff_t *broadcasted_strides, const ptrdiff_t *target_strides);
// return the memory offset of a tensor element given its flattened index
size_t indexToOffset(size_t flat_index, size_t ndim, size_t const *shape, ptrdiff_t const *strides);
size_t indexToOffset(size_t flat_index, size_t ndim, const size_t *shape, const ptrdiff_t *strides);
/**
* get the total array size (element count) after applying padding for a
* ndim-ary tensor with the given shape
*/
size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads);
size_t getPaddedSize(size_t ndim, size_t *shape, const size_t *pads);
// calculate the padded shape and store the result in padded_shape
std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape, size_t const *pads);
std::vector<size_t> getPaddedShape(size_t ndim, const size_t *shape, const size_t *pads);
#endif // __INFINIOP_COMMON_CPU_H__
#include "./cpu_handle.h"
#include "cpu_handle.h"
infiniopStatus_t createCpuHandle(infiniopCpuHandle_t *handle_ptr) {
*handle_ptr = new InfiniopHandle{INFINI_DEVICE_CPU, 0};
......
......@@ -48,26 +48,25 @@ struct InfiniopCudaHandle {
};
template <typename T>
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool, int device_id, cudaStream_t stream, T const &f) {
auto handle = cublas_handle_pool->pop();
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> &pool, cudaStream_t stream, const T &f) {
auto handle = pool->pop();
if (!handle) {
cublasCreate(&(*handle));
}
cublasSetStream(*handle, (cudaStream_t)stream);
cublasSetStream(*handle, stream);
f(*handle);
cublas_handle_pool->push(std::move(*handle));
pool->push(std::move(*handle));
}
template <typename T>
cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool, int device_id, cudaStream_t stream, T const &f) {
auto handle = cudnn_handle_pool->pop();
void use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> &pool, cudaStream_t stream, const T &f) {
auto handle = pool->pop();
if (!handle) {
cudnnCreate(&(*handle));
}
cudnnSetStream(*handle, stream);
cudnnStatus_t status = f(*handle);
cudnn_handle_pool->push(std::move(*handle));
return status;
f(*handle);
pool->push(std::move(*handle));
}
inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
......@@ -96,8 +95,10 @@ inline cudnnDataType_t getCudnnDtype(infiniDtype_t dt) {
// return the memory offset of original tensor, given the flattened index of
// broadcasted tensor
inline __device__ __host__ size_t indexToReducedOffset(
size_t flat_index, size_t ndim, ptrdiff_t const *broadcasted_strides,
ptrdiff_t const *target_strides) {
size_t flat_index,
size_t ndim,
const ptrdiff_t *broadcasted_strides,
const ptrdiff_t *target_strides) {
size_t res = 0;
for (size_t i = 0; i < ndim; ++i) {
res += flat_index / broadcasted_strides[i] * target_strides[i];
......@@ -107,9 +108,11 @@ inline __device__ __host__ size_t indexToReducedOffset(
}
// get the memory offset of the given element in a tensor given its flat index
inline __device__ __host__ size_t indexToOffset(size_t flat_index, size_t ndim,
size_t const *shape,
ptrdiff_t const *strides) {
inline __device__ __host__ size_t indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
......
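use_cublas and use_cudnn above assume a Pool<T> that hands back cached handles so that cublasCreate/cudnnCreate only runs on a cold pool. A minimal sketch of such a pool (this is an assumed shape, not the repo's actual Pool implementation):

#include <mutex>
#include <optional>
#include <vector>

template <typename T>
class Pool {
    std::vector<T> _items;
    std::mutex _mutex;

public:
    // Hand back a cached handle, or std::nullopt when the pool is empty,
    // in which case the caller creates a fresh handle before using it.
    std::optional<T> pop() {
        std::lock_guard<std::mutex> guard(_mutex);
        if (_items.empty()) {
            return std::nullopt;
        }
        T item = std::move(_items.back());
        _items.pop_back();
        return item;
    }

    void push(T &&item) {
        std::lock_guard<std::mutex> guard(_mutex);
        _items.push_back(std::move(item));
    }
};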
#include "./common_cuda.cuh"
#include "common_cuda.cuh"
infiniopStatus_t createCudaHandle(infiniopCudaHandle_t *handle_ptr, infiniDevice_t cuda_device_type) {
// Create a new cublas handle pool
......
#include "infiniop/handle.h"
#ifdef ENABLE_CPU_API
#include "./cpu/cpu_handle.h"
#include "cpu/cpu_handle.h"
#endif
#ifdef ENABLE_CUDA_API
#include "./cuda/cuda_handle.h"
#include "cuda/cuda_handle.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "./bang/bang_handle.h"
#include "bang/bang_handle.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "./ascend/ascend_handle.h"
#include "ascend/ascend_handle.h"
#endif
__C infiniopStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr,
......
#include "infiniop/operator.h"
infiniopStatus_t infiniopGetDescriptorDeviceType(
const InfiniopDescriptor *desc_ptr,
infiniDevice_t *device_type) {
*device_type = desc_ptr->device_type;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t infiniopGetDescriptorDeviceId(
const InfiniopDescriptor *desc_ptr,
int *device_id) {
*device_id = desc_ptr->device_id;
return INFINIOP_STATUS_SUCCESS;
}
......@@ -41,7 +41,7 @@ __C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
}
__C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, size_t *size) {
switch (desc->device) {
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t)desc, size);
......@@ -79,7 +79,7 @@ __C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmax
}
__C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, size_t workspace_size, void *data, void *stream) {
switch (desc->device) {
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t)desc, workspace, workspace_size, data, stream);
......@@ -116,7 +116,7 @@ __C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t des
}
__C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc) {
switch (desc->device) {
switch (desc->device_type) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t)desc);
......
#include "matmul_aclnn.h"
InfiniopMatmulAclnnDescriptor::InfiniopMatmulAclnnDescriptor(
infiniDevice_t _device) {
device = _device;
device_id = 0;
executor = nullptr;
info = nullptr;
cDesc = new aclnnTensorDescriptor();
aDesc = new aclnnTensorDescriptor();
bDesc = new aclnnTensorDescriptor();
mt = 1;
workspaceSize = 0;
}
infiniopStatus_t aclnnCreateMatmulDescriptor(infiniopAscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
int8_t mt) {
infiniDtype_t dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
*desc_ptr = new InfiniopMatmulAclnnDescriptor(handle->device);
(*desc_ptr)->device_id = handle->device_id;
(*desc_ptr)->dtype = dtype;
(*desc_ptr)->mt = mt;
infiniopStatus_t status;
auto info = new MatmulInfo(c_desc, a_desc, b_desc, &status, false);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
(*desc_ptr)->info = info;
auto &cDesc = (*desc_ptr)->cDesc;
auto &aDesc = (*desc_ptr)->aDesc;
auto &bDesc = (*desc_ptr)->bDesc;
// Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
// operation
CHECK_STATUS(cDesc->setDescriptor(
toAclDataType(c_desc->dtype),
{static_cast<int64_t>(info->c_matrix.rows),
static_cast<int64_t>(info->c_matrix.cols)},
{info->c_matrix.row_stride, info->c_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->setDescriptor(
toAclDataType(a_desc->dtype),
{static_cast<int64_t>(info->a_matrix.rows),
static_cast<int64_t>(info->a_matrix.cols)},
{info->a_matrix.row_stride, info->a_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->setDescriptor(
toAclDataType(b_desc->dtype),
{static_cast<int64_t>(info->b_matrix.rows),
static_cast<int64_t>(info->b_matrix.cols)},
{info->b_matrix.row_stride, info->b_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(cDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
auto &workspaceSize = (*desc_ptr)->workspaceSize;
auto &executor = (*desc_ptr)->executor;
aclTensor *tc = cDesc->t;
aclTensor *ta = aDesc->t;
aclTensor *tb = bDesc->t;
aclnnStatus ret;
int64_t transA = 0;
int64_t transB = 0;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, 0.5f, 0.5f, transA, transB, tc,
(*desc_ptr)->mt, &workspaceSize, &executor);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
aclSetAclOpExecutorRepeatable(executor);
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
size_t *size) {
*size = desc->workspaceSize;
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
size_t workspace_size, void *c, void const *a,
void const *b, float alpha, float beta,
void *stream) {
auto &cDesc = desc->cDesc;
auto &aDesc = desc->aDesc;
auto &bDesc = desc->bDesc;
aclTensor *tc = cDesc->t;
aclTensor *ta = aDesc->t;
aclTensor *tb = bDesc->t;
auto batch = desc->info->batch;
size_t workspaceSize;
aclnnStatus ret;
ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, alpha, beta, 0, 0, tc, desc->mt,
&workspaceSize, &(desc->executor));
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
if (workspace_size < workspaceSize) {
return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE;
}
aclSetAclOpExecutorRepeatable(desc->executor);
for (size_t i = 0; i < batch; i++) {
AclSetTensorAddr(desc->executor, 0, ta,
(char *)(a) + i * desc->info->a_matrix.stride * infiniSizeof(desc->dtype));
AclSetTensorAddr(desc->executor, 1, tb,
(char *)(b) + i * desc->info->b_matrix.stride * infiniSizeof(desc->dtype));
AclSetTensorAddr(desc->executor, 2, tc,
(char *)(c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));
AclSetTensorAddr(desc->executor, 3, tc,
(char *)(c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));
ret = aclnnGemm(workspace, workspaceSize, desc->executor, stream);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
}
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc) {
delete desc->cDesc;
delete desc->bDesc;
delete desc->aDesc;
delete desc->info;
aclDestroyAclOpExecutor(desc->executor);
delete desc;
return INFINIOP_STATUS_SUCCESS;
}
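The loop above reuses a single 2D GEMM executor across the batch by rebinding tensor base addresses per iteration. The offset arithmetic it relies on, isolated as a sketch (name and example values are for illustration only):

// Byte offset of batch slice i: the batch stride is counted in elements,
// so it is scaled by the element size. E.g. F16 (2 bytes) with stride 4096:
// batch 3 starts 3 * 4096 * 2 = 24576 bytes past the base pointer.
inline char *batchSlice(void *base, size_t i, ptrdiff_t stride, size_t elem_size) {
    return (char *)base + i * stride * elem_size;
}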
#ifndef __ACLNN_MATMUL_H__
#define __ACLNN_MATMUL_H__
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "../blas.h"
#include "matmul_aclnn_api.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
struct InfiniopMatmulAclnnDescriptor {
infiniDevice_t device;
int device_id;
aclOpExecutor *executor;
MatmulInfo *info;
infiniDtype_t dtype;
aclnnTensorDescriptor_t cDesc, aDesc, bDesc;
// cubeMathType
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t mt;
size_t workspaceSize;
InfiniopMatmulAclnnDescriptor(infiniDevice_t _device);
};
#endif
#ifndef __INFINIOP_MATMUL_ACLNN_API_H__
#define __INFINIOP_MATMUL_ACLNN_API_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"
struct InfiniopMatmulAclnnDescriptor;
typedef struct InfiniopMatmulAclnnDescriptor *MatmulAclnnDescriptor_t;
infiniopStatus_t aclnnCreateMatmulDescriptor(infiniopAscendHandle_t handle,
MatmulAclnnDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc,
int8_t cubeMathType);
infiniopStatus_t aclnnGetMatmulWorkspaceSize(MatmulAclnnDescriptor_t desc,
size_t *size);
infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
size_t workspace_size, void *c, const void *a,
const void *b, float alpha, float beta,
void *stream);
infiniopStatus_t aclnnDestroyMatmulDescriptor(MatmulAclnnDescriptor_t desc);
#endif // __INFINIOP_MATMUL_ACLNN_API_H__
#include "matmul_ascend.h"
#include "../../../devices/ascend/ascend_handle.h"
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
namespace matmul::ascend {
struct Descriptor::Opaque {
mutable aclOpExecutor *executor;
aclnnTensorDescriptor_t c, a, b;
// cubeMathType
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t mt;
~Opaque() {
delete c;
delete a;
delete b;
aclDestroyAclOpExecutor(executor);
}
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniopStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
auto handle = reinterpret_cast<infiniopAscendHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
auto c = new aclnnTensorDescriptor(),
a = new aclnnTensorDescriptor(),
b = new aclnnTensorDescriptor();
// Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
// operation
CHECK_STATUS(c->setDescriptor(
toAclDataType(c_desc->dtype),
{static_cast<int64_t>(info.c_matrix.rows),
static_cast<int64_t>(info.c_matrix.cols)},
{info.c_matrix.row_stride, info.c_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(a->setDescriptor(
toAclDataType(a_desc->dtype),
{static_cast<int64_t>(info.a_matrix.rows),
static_cast<int64_t>(info.a_matrix.cols)},
{info.a_matrix.row_stride, info.a_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(b->setDescriptor(
toAclDataType(b_desc->dtype),
{static_cast<int64_t>(info.b_matrix.rows),
static_cast<int64_t>(info.b_matrix.cols)},
{info.b_matrix.row_stride, info.b_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(c->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(a->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(b->createTensor(), INFINIOP_STATUS_SUCCESS);
auto tc = c->t,
ta = a->t,
tb = b->t;
aclOpExecutor *executor;
size_t workspace_size;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
int8_t mt = 1;
auto ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, .5, .5, 0, 0, tc, mt, &workspace_size, &executor);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
aclSetAclOpExecutorRepeatable(executor);
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
new Opaque{
executor,
c,
a,
b,
mt,
},
handle->device, handle->device_id);
return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t Descriptor::calculate(
void *workspace,
size_t workspaceSize_,
void *c,
float beta,
const void *a,
const void *b,
float alpha,
void *stream) const {
auto tc = _opaque->c->t,
ta = _opaque->a->t,
tb = _opaque->b->t;
size_t workspace_size;
auto ret = aclnnGemmGetWorkspaceSize(
ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
&workspace_size, &(_opaque->executor));
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
if (workspaceSize_ < workspace_size) {
return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE;
}
aclSetAclOpExecutorRepeatable(_opaque->executor);
auto unit = infiniSizeof(_dtype);
for (size_t i = 0; i < _info.batch; ++i) {
AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
ret = aclnnGemm(workspace, workspace_size, _opaque->executor, stream);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
}
return INFINIOP_STATUS_SUCCESS;
}
} // namespace matmul::ascend
#ifndef __MATMUL_ASCEND_H__
#define __MATMUL_ASCEND_H__
#include "../matmul.h"
DESCRIPTOR(ascend)
#endif // __MATMUL_ASCEND_H__
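The Opaque struct together with the out-of-line destructor is the pImpl idiom: ACL-specific members (the executor, aclnn tensor descriptors, cubeMathType) stay out of the shared matmul header, while DESCRIPTOR(ascend) presumably stamps out the common Descriptor class for this backend; its definition lives in ../matmul.h, which is not part of this diff.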