Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
f7137096
Commit
f7137096
authored
Feb 19, 2025
by
YdrMaster
Browse files
issue/63/refactor: 重构 Matmul 所有实现,添加命名空间
Signed-off-by:
YdrMaster
<
ydrml@hotmail.com
>
parent
8e34901e
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
577 additions
and
461 deletions
+577
-461
src/infiniop/devices/cpu/common_cpu.h
src/infiniop/devices/cpu/common_cpu.h
+1
-0
src/infiniop/devices/cuda/common_cuda.cuh
src/infiniop/devices/cuda/common_cuda.cuh
+1
-1
src/infiniop/ops/matmul/ascend/matmul_aclnn.h
src/infiniop/ops/matmul/ascend/matmul_aclnn.h
+0
-29
src/infiniop/ops/matmul/ascend/matmul_aclnn_api.h
src/infiniop/ops/matmul/ascend/matmul_aclnn_api.h
+0
-25
src/infiniop/ops/matmul/ascend/matmul_ascend.cc
src/infiniop/ops/matmul/ascend/matmul_ascend.cc
+148
-0
src/infiniop/ops/matmul/ascend/matmul_ascend.h
src/infiniop/ops/matmul/ascend/matmul_ascend.h
+9
-0
src/infiniop/ops/matmul/bang/matmul_bang.cc
src/infiniop/ops/matmul/bang/matmul_bang.cc
+159
-0
src/infiniop/ops/matmul/bang/matmul_bang.h
src/infiniop/ops/matmul/bang/matmul_bang.h
+9
-0
src/infiniop/ops/matmul/bang/matmul_cnnl.cc
src/infiniop/ops/matmul/bang/matmul_cnnl.cc
+0
-100
src/infiniop/ops/matmul/bang/matmul_cnnl.h
src/infiniop/ops/matmul/bang/matmul_cnnl.h
+0
-48
src/infiniop/ops/matmul/bang/matmul_cnnl_api.h
src/infiniop/ops/matmul/bang/matmul_cnnl_api.h
+0
-26
src/infiniop/ops/matmul/blas.h
src/infiniop/ops/matmul/blas.h
+34
-28
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
+45
-33
src/infiniop/ops/matmul/cpu/matmul_cpu.h
src/infiniop/ops/matmul/cpu/matmul_cpu.h
+6
-10
src/infiniop/ops/matmul/cpu/matmul_cpu_api.h
src/infiniop/ops/matmul/cpu/matmul_cpu_api.h
+0
-25
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
+106
-19
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
+6
-14
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
+0
-30
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
+0
-73
src/infiniop/ops/matmul/matmul.h
src/infiniop/ops/matmul/matmul.h
+53
-0
No files found.
src/infiniop/devices/cpu/common_cpu.h
View file @
f7137096
...
...
@@ -2,6 +2,7 @@
#define __INFINIOP__COMMON_CPU_H__
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>
...
...
src/infiniop/devices/cuda/common_cuda.cuh
View file @
f7137096
...
...
@@ -48,7 +48,7 @@ struct InfiniopCudaHandle {
};
template
<
typename
T
>
void
use_cublas
(
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
cublas_handle_pool
,
int
device_id
,
cudaStream_t
stream
,
T
const
&
f
)
{
void
use_cublas
(
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
&
cublas_handle_pool
,
cudaStream_t
stream
,
T
const
&
f
)
{
auto
handle
=
cublas_handle_pool
->
pop
();
if
(
!
handle
)
{
cublasCreate
(
&
(
*
handle
));
...
...
src/infiniop/ops/matmul/ascend/matmul_aclnn.h
deleted
100644 → 0
View file @
8e34901e
#ifndef __ACLNN_MATMUL_H__
#define __ACLNN_MATMUL_H__
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "../blas.h"
#include "matmul_aclnn_api.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
struct
InfiniopMatmulAclnnDescriptor
{
infiniDevice_t
device
;
int
device_id
;
aclOpExecutor
*
executor
;
MatmulInfo
*
info
;
infiniDtype_t
dtype
;
aclnnTensorDescriptor_t
cDesc
,
aDesc
,
bDesc
;
// cubeMathType
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t
mt
;
size_t
workspaceSize
;
InfiniopMatmulAclnnDescriptor
(
infiniDevice_t
_device
);
};
#endif
src/infiniop/ops/matmul/ascend/matmul_aclnn_api.h
deleted
100644 → 0
View file @
8e34901e
#ifndef __INFINIOP_MATMUL_ACLNN_API_H__
#define __INFINIOP_MATMUL_ACLNN_API_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"
struct
InfiniopMatmulAclnnDescriptor
;
typedef
struct
InfiniopMatmulAclnnDescriptor
*
MatmulAclnnDescriptor_t
;
infiniopStatus_t
aclnnCreateMatmulDescriptor
(
infiniopAscendHandle_t
handle
,
MatmulAclnnDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
,
int8_t
cubeMathType
);
infiniopStatus_t
aclnnGetMatmulWorkspaceSize
(
MatmulAclnnDescriptor_t
desc
,
size_t
*
size
);
infiniopStatus_t
aclnnMatmul
(
MatmulAclnnDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
const
void
*
a
,
const
void
*
b
,
float
alpha
,
float
beta
,
void
*
stream
);
infiniopStatus_t
aclnnDestroyMatmulDescriptor
(
MatmulAclnnDescriptor_t
desc
);
#endif // __INFINIOP_MATMUL_ACLNN_API_H__
src/infiniop/ops/matmul/ascend/matmul_a
clnn
.cc
→
src/infiniop/ops/matmul/ascend/matmul_a
scend
.cc
View file @
f7137096
#include "matmul_aclnn.h"
InfiniopMatmulAclnnDescriptor
::
InfiniopMatmulAclnnDescriptor
(
infiniDevice_t
_device
)
{
device
=
_device
;
device_id
=
0
;
executor
=
nullptr
;
info
=
nullptr
;
cDesc
=
new
aclnnTensorDescriptor
();
aDesc
=
new
aclnnTensorDescriptor
();
bDesc
=
new
aclnnTensorDescriptor
();
mt
=
1
;
workspaceSize
=
0
;
#include "matmul_ascend.h"
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include <acl/acl_base.h>
#include <aclnn/acl_meta.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
namespace
matmul
::
ascend
{
struct
Descriptor
::
Opaque
{
mutable
aclOpExecutor
*
executor
;
aclnnTensorDescriptor_t
cDesc
,
aDesc
,
bDesc
;
// cubeMathType
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t
mt
;
~
Opaque
()
{
delete
cDesc
;
delete
aDesc
;
delete
bDesc
;
aclDestroyAclOpExecutor
(
executor
);
}
};
Descriptor
::~
Descriptor
()
{
delete
_opaque
;
}
infiniopStatus_t
aclnnCreateMatmulDescriptor
(
infiniopAscendHandle_t
handle
,
MatmulAclnnDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a
_desc
,
infiniopTensorDescriptor_t
b
_desc
,
int8_t
mt
)
{
infiniopStatus_t
Descriptor
::
create
(
infiniopAscendHandle_t
handle
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
c
_desc
,
infiniopTensorDescriptor_t
a
_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
infiniDtype_t
dtype
=
c_desc
->
dtype
;
if
(
dtype
!=
INFINI_DTYPE_F16
&&
dtype
!=
INFINI_DTYPE_F32
)
{
return
INFINIOP_STATUS_BAD_TENSOR_DTYPE
;
}
*
desc_ptr
=
new
InfiniopMatmulAclnnDescriptor
(
handle
->
device
);
(
*
desc_ptr
)
->
device_id
=
handle
->
device_id
;
(
*
desc_ptr
)
->
dtype
=
dtype
;
(
*
desc_ptr
)
->
mt
=
mt
;
infiniopStatus_t
status
;
auto
info
=
new
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
false
);
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
ROW_MAJOR
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
(
*
desc_ptr
)
->
info
=
info
;
auto
&
cDesc
=
(
*
desc_ptr
)
->
cDesc
;
auto
&
aDesc
=
(
*
desc_ptr
)
->
aDesc
;
auto
&
bDesc
=
(
*
desc_ptr
)
->
bDesc
;
auto
cDesc
=
new
aclnnTensorDescriptor
(),
aDesc
=
new
aclnnTensorDescriptor
(),
bDesc
=
new
aclnnTensorDescriptor
()
;
// Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
// operation
CHECK_STATUS
(
cDesc
->
setDescriptor
(
toAclDataType
(
c_desc
->
dtype
),
{
static_cast
<
int64_t
>
(
info
->
c_matrix
.
rows
),
static_cast
<
int64_t
>
(
info
->
c_matrix
.
cols
)},
{
info
->
c_matrix
.
row_stride
,
info
->
c_matrix
.
col_stride
}),
{
static_cast
<
int64_t
>
(
info
.
c_matrix
.
rows
),
static_cast
<
int64_t
>
(
info
.
c_matrix
.
cols
)},
{
info
.
c_matrix
.
row_stride
,
info
.
c_matrix
.
col_stride
}),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
aDesc
->
setDescriptor
(
toAclDataType
(
a_desc
->
dtype
),
{
static_cast
<
int64_t
>
(
info
->
a_matrix
.
rows
),
static_cast
<
int64_t
>
(
info
->
a_matrix
.
cols
)},
{
info
->
a_matrix
.
row_stride
,
info
->
a_matrix
.
col_stride
}),
{
static_cast
<
int64_t
>
(
info
.
a_matrix
.
rows
),
static_cast
<
int64_t
>
(
info
.
a_matrix
.
cols
)},
{
info
.
a_matrix
.
row_stride
,
info
.
a_matrix
.
col_stride
}),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
bDesc
->
setDescriptor
(
toAclDataType
(
b_desc
->
dtype
),
{
static_cast
<
int64_t
>
(
info
->
b_matrix
.
rows
),
static_cast
<
int64_t
>
(
info
->
b_matrix
.
cols
)},
{
info
->
b_matrix
.
row_stride
,
info
->
b_matrix
.
col_stride
}),
{
static_cast
<
int64_t
>
(
info
.
b_matrix
.
rows
),
static_cast
<
int64_t
>
(
info
.
b_matrix
.
cols
)},
{
info
.
b_matrix
.
row_stride
,
info
.
b_matrix
.
col_stride
}),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
cDesc
->
createTensor
(),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
aDesc
->
createTensor
(),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
bDesc
->
createTensor
(),
INFINIOP_STATUS_SUCCESS
);
auto
&
workspaceSize
=
(
*
desc_ptr
)
->
workspaceSize
;
auto
&
executor
=
(
*
desc_ptr
)
->
executor
;
aclTensor
*
tc
=
cDesc
->
t
;
aclTensor
*
ta
=
aDesc
->
t
;
aclTensor
*
tb
=
bDesc
->
t
;
aclnnStatus
ret
;
int64_t
transA
=
0
;
int64_t
transB
=
0
;
auto
tc
=
cDesc
->
t
,
ta
=
aDesc
->
t
,
tb
=
bDesc
->
t
;
aclOpExecutor
*
executor
;
size_t
workspaceSize
;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
ret
=
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
0.5
f
,
0.5
f
,
transA
,
transB
,
tc
,
(
*
desc_ptr
)
->
mt
,
&
workspaceSize
,
&
executor
);
int8_t
mt
=
1
;
auto
ret
=
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
.5
,
.5
,
0
,
0
,
tc
,
mt
,
&
workspaceSize
,
&
executor
);
CHECK_RET
(
ret
==
ACL_SUCCESS
,
LOG_PRINT
(
"aclnnGemmGetWorkspaceSize failed. ERROR: %d
\n
"
,
ret
);
return
INFINIOP_STATUS_INTERNAL_ERROR
);
aclSetAclOpExecutorRepeatable
(
executor
);
*
desc_ptr
=
new
Descriptor
(
dtype
,
info
,
workspaceSize
,
new
Opaque
{
executor
,
cDesc
,
aDesc
,
bDesc
,
mt
,
},
handle
->
device
,
handle
->
device_id
);
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
aclnnGetMatmulWorkspaceSize
(
MatmulAclnnDescriptor_t
desc
,
size_t
*
size
)
{
*
size
=
desc
->
workspaceSize
;
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
aclnnMatmul
(
MatmulAclnnDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
,
void
*
stream
)
{
auto
&
cDesc
=
desc
->
cDesc
;
auto
&
aDesc
=
desc
->
aDesc
;
auto
&
bDesc
=
desc
->
bDesc
;
aclTensor
*
tc
=
cDesc
->
t
;
aclTensor
*
ta
=
aDesc
->
t
;
aclTensor
*
tb
=
bDesc
->
t
;
infiniopStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspaceSize_
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
const
{
auto
batch
=
desc
->
info
->
batch
;
auto
tc
=
_opaque
->
cDesc
->
t
,
ta
=
_opaque
->
aDesc
->
t
,
tb
=
_opaque
->
bDesc
->
t
;
size_t
workspaceSize
;
a
clnnStatus
ret
;
ret
=
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
alpha
,
beta
,
0
,
0
,
tc
,
desc
->
mt
,
&
workspaceSize
,
&
(
desc
->
executor
));
a
uto
ret
=
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
alpha
,
beta
,
0
,
0
,
tc
,
_opaque
->
mt
,
&
workspaceSize
,
&
(
_opaque
->
executor
));
CHECK_RET
(
ret
==
ACL_SUCCESS
,
LOG_PRINT
(
"aclnnGemmGetWorkspaceSize failed. ERROR: %d
\n
"
,
ret
);
return
INFINIOP_STATUS_INTERNAL_ERROR
);
if
(
workspace
_s
ize
<
workspaceSize
)
{
if
(
workspace
S
ize
_
<
workspaceSize
)
{
return
INFINIOP_STATUS_INSUFFICIENT_WORKSPACE
;
}
aclSetAclOpExecutorRepeatable
(
desc
->
executor
);
for
(
size_t
i
=
0
;
i
<
batch
;
i
++
)
{
AclSetTensorAddr
(
desc
->
executor
,
0
,
ta
,
(
char
*
)(
a
)
+
i
*
desc
->
info
->
a_matrix
.
stride
*
infiniSizeof
(
desc
->
dtype
));
AclSetTensorAddr
(
desc
->
executor
,
1
,
tb
,
(
char
*
)(
b
)
+
i
*
desc
->
info
->
b_matrix
.
stride
*
infiniSizeof
(
desc
->
dtype
));
AclSetTensorAddr
(
desc
->
executor
,
2
,
tc
,
(
char
*
)(
c
)
+
i
*
desc
->
info
->
c_matrix
.
stride
*
infiniSizeof
(
desc
->
dtype
));
AclSetTensorAddr
(
desc
->
executor
,
3
,
tc
,
(
char
*
)(
c
)
+
i
*
desc
->
info
->
c_matrix
.
stride
*
infiniSizeof
(
desc
->
dtype
));
ret
=
aclnnGemm
(
workspace
,
workspaceSize
,
desc
->
executor
,
stream
);
aclSetAclOpExecutorRepeatable
(
_opaque
->
executor
);
for
(
size_t
i
=
0
;
i
<
info
.
batch
;
++
i
)
{
AclSetTensorAddr
(
_opaque
->
executor
,
0
,
ta
,
((
char
*
)
a
)
+
i
*
info
.
a_matrix
.
stride
*
infiniSizeof
(
dtype
));
AclSetTensorAddr
(
_opaque
->
executor
,
1
,
tb
,
((
char
*
)
b
)
+
i
*
info
.
b_matrix
.
stride
*
infiniSizeof
(
dtype
));
AclSetTensorAddr
(
_opaque
->
executor
,
2
,
tc
,
((
char
*
)
c
)
+
i
*
info
.
c_matrix
.
stride
*
infiniSizeof
(
dtype
));
AclSetTensorAddr
(
_opaque
->
executor
,
3
,
tc
,
((
char
*
)
c
)
+
i
*
info
.
c_matrix
.
stride
*
infiniSizeof
(
dtype
));
ret
=
aclnnGemm
(
workspace
,
workspaceSize
,
_opaque
->
executor
,
stream
);
CHECK_RET
(
ret
==
ACL_SUCCESS
,
LOG_PRINT
(
"aclnnGemm failed. ERROR: %d
\n
"
,
ret
);
return
INFINIOP_STATUS_INTERNAL_ERROR
);
...
...
@@ -139,13 +145,4 @@ infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
aclnnDestroyMatmulDescriptor
(
MatmulAclnnDescriptor_t
desc
)
{
delete
desc
->
cDesc
;
delete
desc
->
bDesc
;
delete
desc
->
aDesc
;
delete
desc
->
info
;
aclDestroyAclOpExecutor
(
desc
->
executor
);
delete
desc
;
return
INFINIOP_STATUS_SUCCESS
;
}
}
// namespace matmul::ascend
src/infiniop/ops/matmul/ascend/matmul_ascend.h
0 → 100644
View file @
f7137096
#ifndef __MATMUL_ASCEND_H__
#define __MATMUL_ASCEND_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "../matmul.h"
DESCRIPTOR
(
ascend
,
infiniopAscendHandle_t
)
#endif // __MATMUL_ASCEND_H__
src/infiniop/ops/matmul/bang/matmul_bang.cc
0 → 100644
View file @
f7137096
#
include
"matmul_bang.h"
#include "../../../devices/bang/common_bang.h"
#include "../../utils.h"
#include <cnnl_extra.h>
namespace
matmul
::
bang
{
struct
Descriptor
::
Opaque
{
cnnlMatMulDescriptor_t
opDesc
;
cnnlMatMulAlgo_t
algo
;
cnnlMatMulHeuristicResult_t
algoResult
;
cnnlTensorDescriptor_t
aDesc
,
bDesc
,
cDesc
;
std
::
shared_ptr
<
Pool
<
cnnlHandle_t
>>
cnnl_handle_pool
;
~
Opaque
()
{
cnnlDestroyTensorDescriptor
(
aDesc
);
cnnlDestroyTensorDescriptor
(
bDesc
);
cnnlDestroyTensorDescriptor
(
cDesc
);
cnnlMatMulDescDestroy
(
opDesc
);
cnnlMatMulAlgoDestroy
(
algo
);
cnnlDestroyMatMulHeuristicResult
(
algoResult
);
}
};
static
void
setMatrixTensorEx
(
cnnlTensorDescriptor_t
desc
,
const
BlasMatrix
&
matrix
,
infiniDtype_t
dtype
,
bool
trans
=
false
)
{
int
ndim
=
matrix
.
ndim
;
int
batch
=
matrix
.
batch
;
int
stride
=
static_cast
<
int
>
(
matrix
.
stride
);
int
rows
=
matrix
.
rows
;
int
cols
=
matrix
.
cols
;
int
row_stride
=
matrix
.
row_stride
;
int
col_stride
=
matrix
.
col_stride
;
switch
(
ndim
)
{
case
3
:
{
std
::
vector
<
int
>
dim_size
=
{
batch
,
rows
,
cols
};
std
::
vector
<
int
>
dim_stride
=
{
stride
,
row_stride
,
col_stride
};
cnnlSetTensorDescriptorEx
(
desc
,
CNNL_LAYOUT_ARRAY
,
cnnlDataTypeConvert
(
dtype
),
dim_size
.
size
(),
dim_size
.
data
(),
dim_stride
.
data
());
}
break
;
case
2
:
{
std
::
vector
<
int
>
dim_size
=
{
rows
,
cols
};
std
::
vector
<
int
>
dim_stride
=
{
row_stride
,
col_stride
};
cnnlSetTensorDescriptorEx
(
desc
,
CNNL_LAYOUT_ARRAY
,
cnnlDataTypeConvert
(
dtype
),
dim_size
.
size
(),
dim_size
.
data
(),
dim_stride
.
data
());
}
break
;
}
}
Descriptor
::~
Descriptor
()
{
delete
_opaque
;
}
infiniopStatus_t
Descriptor
::
create
(
infiniopBangHandle_t
handle
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
infiniDtype_t
dtype
=
c_desc
->
dtype
;
if
(
dtype
!=
INFINI_DTYPE_F16
&&
dtype
!=
INFINI_DTYPE_F32
)
{
return
INFINIOP_STATUS_BAD_TENSOR_DTYPE
;
}
infiniopStatus_t
status
;
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
ROW_MAJOR
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
cnnlTensorDescriptor_t
aDesc
,
bDesc
,
cDesc
;
cnnlCreateTensorDescriptor
(
&
aDesc
);
cnnlCreateTensorDescriptor
(
&
bDesc
);
cnnlCreateTensorDescriptor
(
&
cDesc
);
setMatrixTensorEx
(
aDesc
,
info
.
a_matrix
,
a_desc
->
dtype
);
setMatrixTensorEx
(
bDesc
,
info
.
b_matrix
,
b_desc
->
dtype
);
setMatrixTensorEx
(
cDesc
,
info
.
c_matrix
,
c_desc
->
dtype
);
cnnlMatMulDescriptor_t
opDesc
;
cnnlMatMulAlgo_t
algo
;
cnnlMatMulHeuristicResult_t
algoResult
;
cnnlMatMulDescCreate
(
&
opDesc
);
cnnlMatMulAlgoCreate
(
&
algo
);
cnnlCreateMatMulHeuristicResult
(
&
algoResult
);
int32_t
use_stride
=
true
;
cnnlSetMatMulDescAttr
(
opDesc
,
CNNL_MATMUL_USE_STRIDE
,
&
use_stride
,
sizeof
(
int32_t
));
int
count
=
0
;
use_cnnl
(
handle
->
cnnl_handle_pool
,
[
&
](
cnnlHandle_t
_handle
)
{
cnnlGetBatchMatMulAlgoHeuristic
(
_handle
,
opDesc
,
aDesc
,
bDesc
,
cDesc
,
NULL
,
1
,
&
algoResult
,
&
count
);
});
size_t
workspace_size
;
cnnlGetBatchMatMulHeuristicResult
(
algoResult
,
algo
,
&
workspace_size
);
*
desc_ptr
=
new
Descriptor
(
dtype
,
info
,
workspace_size
,
new
Opaque
{
opDesc
,
algo
,
algoResult
,
aDesc
,
bDesc
,
cDesc
,
handle
->
cnnl_handle_pool
},
handle
->
device
,
handle
->
device_id
);
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
const
{
if
(
info
.
is_transed
)
{
std
::
swap
(
a
,
b
);
}
use_cnnl
(
_opaque
->
cnnl_handle_pool
,
(
cnrtQueue_t
)
stream
,
[
&
](
cnnlHandle_t
handle
)
{
cnnlBatchMatMulBCast_v2
(
handle
,
_opaque
->
opDesc
,
_opaque
->
algo
,
&
alpha
,
_opaque
->
aDesc
,
a
,
_opaque
->
bDesc
,
b
,
&
beta
,
_opaque
->
cDesc
,
c
,
workspace
,
workspace_size
);
});
cnrtQueueSync
((
cnrtQueue_t
)
stream
);
return
INFINIOP_STATUS_SUCCESS
;
}
}
// namespace matmul::bang
src/infiniop/ops/matmul/bang/matmul_bang.h
0 → 100644
View file @
f7137096
#ifndef __MATMUL_BANG_H__
#define __MATMUL_BANG_H__
#include "../../../devices/bang/bang_handle.h"
#include "../matmul.h"
DESCRIPTOR
(
bang
,
infiniopBangHandle_t
)
#endif // __MATMUL_BANG_H__
src/infiniop/ops/matmul/bang/matmul_cnnl.cc
deleted
100644 → 0
View file @
8e34901e
#
include
"matmul_cnnl.h"
#include "../../../devices/bang/common_bang.h"
#include "../../utils.h"
#include "matmul_cnnl_api.h"
infiniopStatus_t
bangCreateMatmulDescriptor
(
infiniopBangHandle_t
handle
,
infiniopMatmulBangDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
infiniopStatus_t
status
;
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
false
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
cnnlTensorDescriptor_t
aDesc
,
bDesc
,
cDesc
;
cnnlCreateTensorDescriptor
(
&
aDesc
);
cnnlCreateTensorDescriptor
(
&
bDesc
);
cnnlCreateTensorDescriptor
(
&
cDesc
);
setMatrixTensorEx
(
aDesc
,
info
.
a_matrix
,
a_desc
->
dtype
);
setMatrixTensorEx
(
bDesc
,
info
.
b_matrix
,
b_desc
->
dtype
);
setMatrixTensorEx
(
cDesc
,
info
.
c_matrix
,
c_desc
->
dtype
);
cnnlMatMulDescriptor_t
opDesc
;
cnnlMatMulAlgo_t
algo
;
cnnlMatMulHeuristicResult_t
algoResult
;
cnnlMatMulDescCreate
(
&
opDesc
);
cnnlMatMulAlgoCreate
(
&
algo
);
cnnlCreateMatMulHeuristicResult
(
&
algoResult
);
int32_t
use_stride
=
true
;
cnnlSetMatMulDescAttr
(
opDesc
,
CNNL_MATMUL_USE_STRIDE
,
&
use_stride
,
sizeof
(
int32_t
));
int
count
=
0
;
use_cnnl
(
handle
->
cnnl_handle_pool
,
[
&
](
cnnlHandle_t
_handle
)
{
cnnlGetBatchMatMulAlgoHeuristic
(
_handle
,
opDesc
,
aDesc
,
bDesc
,
cDesc
,
NULL
,
1
,
&
algoResult
,
&
count
);
});
size_t
workspace_size
;
cnnlGetBatchMatMulHeuristicResult
(
algoResult
,
algo
,
&
workspace_size
);
*
desc_ptr
=
new
InfiniopMatmulBangDescriptor
{
handle
->
device
,
handle
->
device_id
,
info
,
c_desc
->
dtype
,
handle
->
cnnl_handle_pool
,
aDesc
,
bDesc
,
cDesc
,
opDesc
,
algo
,
algoResult
,
workspace_size
};
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
bangGetMatmulWorkspaceSize
(
infiniopMatmulBangDescriptor_t
desc
,
size_t
*
size
)
{
*
size
=
desc
->
workspace_size
;
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
bangDestroyMatmulDescriptor
(
infiniopMatmulBangDescriptor_t
desc
)
{
desc
->
cnnl_handle_pool
=
nullptr
;
cnnlDestroyTensorDescriptor
(
desc
->
aDesc
);
cnnlDestroyTensorDescriptor
(
desc
->
bDesc
);
cnnlDestroyTensorDescriptor
(
desc
->
cDesc
);
cnnlMatMulDescDestroy
(
desc
->
opDesc
);
cnnlMatMulAlgoDestroy
(
desc
->
algo
);
cnnlDestroyMatMulHeuristicResult
(
desc
->
algoResult
);
delete
desc
;
return
INFINIOP_STATUS_SUCCESS
;
}
void
bangMatmulCnnl
(
infiniopMatmulBangDescriptor_t
desc
,
void
*
workspace
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
{
auto
info
=
desc
->
info
;
if
(
info
.
is_transed
)
{
std
::
swap
(
a
,
b
);
}
use_cnnl
(
desc
->
cnnl_handle_pool
,
(
cnrtQueue_t
)
stream
,
[
&
](
cnnlHandle_t
handle
)
{
cnnlBatchMatMulBCast_v2
(
handle
,
desc
->
opDesc
,
desc
->
algo
,
&
alpha
,
desc
->
aDesc
,
a
,
desc
->
bDesc
,
b
,
&
beta
,
desc
->
cDesc
,
c
,
workspace
,
desc
->
workspace_size
);
});
}
infiniopStatus_t
bangMatmul
(
infiniopMatmulBangDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
,
void
*
stream
)
{
if
(
desc
->
dtype
==
INFINI_DTYPE_F16
||
desc
->
dtype
==
INFINI_DTYPE_F32
)
{
bangMatmulCnnl
(
desc
,
workspace
,
c
,
beta
,
a
,
b
,
alpha
,
stream
);
cnrtQueueSync
((
cnrtQueue_t
)
stream
);
return
INFINIOP_STATUS_SUCCESS
;
}
return
INFINIOP_STATUS_BAD_TENSOR_DTYPE
;
}
src/infiniop/ops/matmul/bang/matmul_cnnl.h
deleted
100644 → 0
View file @
8e34901e
#ifndef __CNNL_MATMUL_H__
#define __CNNL_MATMUL_H__
#include "../../../devices/bang/common_bang.h"
#include "../blas.h"
#include "cnnl_extra.h"
struct
InfiniopMatmulBangDescriptor
{
infiniDevice_t
device
;
int
device_id
;
MatmulInfo
info
;
infiniDtype_t
dtype
;
std
::
shared_ptr
<
Pool
<
cnnlHandle_t
>>
cnnl_handle_pool
;
cnnlTensorDescriptor_t
aDesc
;
cnnlTensorDescriptor_t
bDesc
;
cnnlTensorDescriptor_t
cDesc
;
cnnlMatMulDescriptor_t
opDesc
;
cnnlMatMulAlgo_t
algo
;
cnnlMatMulHeuristicResult_t
algoResult
;
size_t
workspace_size
;
};
inline
void
setMatrixTensorEx
(
cnnlTensorDescriptor_t
desc
,
const
BlasMatrix
&
matrix
,
infiniDtype_t
dtype
,
bool
trans
=
false
)
{
int
ndim
=
matrix
.
ndim
;
int
batch
=
matrix
.
batch
;
int
stride
=
static_cast
<
int
>
(
matrix
.
stride
);
int
rows
=
matrix
.
rows
;
int
cols
=
matrix
.
cols
;
int
row_stride
=
matrix
.
row_stride
;
int
col_stride
=
matrix
.
col_stride
;
if
(
ndim
==
3
)
{
std
::
vector
<
int
>
dim_size
=
{
batch
,
rows
,
cols
};
std
::
vector
<
int
>
dim_stride
=
{
stride
,
row_stride
,
col_stride
};
cnnlSetTensorDescriptorEx
(
desc
,
CNNL_LAYOUT_ARRAY
,
cnnlDataTypeConvert
(
dtype
),
dim_size
.
size
(),
dim_size
.
data
(),
dim_stride
.
data
());
}
else
if
(
ndim
==
2
)
{
std
::
vector
<
int
>
dim_size
=
{
rows
,
cols
};
std
::
vector
<
int
>
dim_stride
=
{
row_stride
,
col_stride
};
cnnlSetTensorDescriptorEx
(
desc
,
CNNL_LAYOUT_ARRAY
,
cnnlDataTypeConvert
(
dtype
),
dim_size
.
size
(),
dim_size
.
data
(),
dim_stride
.
data
());
}
}
#endif // __CNNL_MATMUL_H__
src/infiniop/ops/matmul/bang/matmul_cnnl_api.h
deleted
100644 → 0
View file @
8e34901e
#ifndef __CNNL_MATMUL_API_H__
#define __CNNL_MATMUL_API_H__
#include "../../../devices/bang/bang_handle.h"
#include "infiniop/operator.h"
struct
InfiniopMatmulBangDescriptor
;
typedef
struct
InfiniopMatmulBangDescriptor
*
infiniopMatmulBangDescriptor_t
;
infiniopStatus_t
bangCreateMatmulDescriptor
(
infiniopBangHandle_t
handle
,
infiniopMatmulBangDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
);
infiniopStatus_t
bangGetMatmulWorkspaceSize
(
infiniopMatmulBangDescriptor_t
desc
,
size_t
*
size
);
infiniopStatus_t
bangMatmul
(
infiniopMatmulBangDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
,
void
*
stream
);
infiniopStatus_t
bangDestroyMatmulDescriptor
(
infiniopMatmulBangDescriptor_t
desc
);
#endif
src/infiniop/ops/matmul/blas.h
View file @
f7137096
#ifndef __BLAS_H__
#define __BLAS_H__
#include "../utils.h"
#include "infiniop/operator.h"
#include <algorithm>
#include <stdint.h>
typedef
struct
BlasMatrix
{
namespace
matmul
{
struct
BlasMatrix
{
size_t
ndim
;
size_t
batch
;
ptrdiff_t
stride
;
...
...
@@ -15,31 +14,31 @@ typedef struct BlasMatrix {
ptrdiff_t
row_stride
;
ptrdiff_t
col_stride
;
BlasMatrix
()
{}
BlasMatrix
()
=
default
;
BlasMatrix
(
infiniopTensorDescriptor_t
layout
,
infiniopStatus_t
*
status
)
{
if
(
layout
->
ndim
==
2
)
{
this
->
ndim
=
2
;
this
->
batch
=
1
;
this
->
stride
=
0
;
this
->
rows
=
layout
->
shape
[
0
];
this
->
cols
=
layout
->
shape
[
1
];
this
->
row_stride
=
layout
->
strides
[
0
];
this
->
col_stride
=
layout
->
strides
[
1
];
ndim
=
2
;
batch
=
1
;
stride
=
0
;
rows
=
layout
->
shape
[
0
];
cols
=
layout
->
shape
[
1
];
row_stride
=
layout
->
strides
[
0
];
col_stride
=
layout
->
strides
[
1
];
}
else
if
(
layout
->
ndim
==
3
)
{
this
->
ndim
=
3
;
this
->
batch
=
layout
->
shape
[
0
];
this
->
stride
=
this
->
batch
==
1
?
0
:
layout
->
strides
[
0
];
this
->
rows
=
layout
->
shape
[
1
];
this
->
cols
=
layout
->
shape
[
2
];
this
->
row_stride
=
layout
->
strides
[
1
];
this
->
col_stride
=
layout
->
strides
[
2
];
ndim
=
3
;
batch
=
layout
->
shape
[
0
];
stride
=
batch
==
1
?
0
:
layout
->
strides
[
0
];
rows
=
layout
->
shape
[
1
];
cols
=
layout
->
shape
[
2
];
row_stride
=
layout
->
strides
[
1
];
col_stride
=
layout
->
strides
[
2
];
}
else
{
*
status
=
INFINIOP_STATUS_BAD_TENSOR_SHAPE
;
return
;
}
if
(
this
->
row_stride
!=
1
&&
this
->
col_stride
!=
1
)
{
if
(
row_stride
!=
1
&&
col_stride
!=
1
)
{
*
status
=
INFINIOP_STATUS_BAD_TENSOR_STRIDES
;
return
;
}
...
...
@@ -48,7 +47,7 @@ typedef struct BlasMatrix {
}
bool
match_batch
(
size_t
_batch
)
const
{
return
this
->
batch
==
_batch
||
this
->
batch
==
1
;
return
batch
==
_batch
||
batch
==
1
;
}
void
transpose
()
{
...
...
@@ -57,13 +56,14 @@ typedef struct BlasMatrix {
}
ptrdiff_t
ld
()
const
{
if
(
this
->
row_stride
==
1
)
{
return
this
->
col_stride
;
}
else
{
return
this
->
row_stride
;
}
return
row_stride
==
1
?
col_stride
:
row_stride
;
}
}
BlasMatrix
;
};
enum
class
MatrixLayout
:
uint8_t
{
COL_MAJOR
,
ROW_MAJOR
,
};
struct
MatmulInfo
{
BlasMatrix
a_matrix
;
...
...
@@ -74,7 +74,11 @@ struct MatmulInfo {
bool
is_transed
=
false
;
MatmulInfo
(
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
,
infiniopStatus_t
*
status
,
bool
col_major
=
true
)
{
MatmulInfo
(
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
,
infiniopStatus_t
*
status
,
MatrixLayout
layout
)
{
a_matrix
=
BlasMatrix
(
a_desc
,
status
);
if
(
*
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
;
...
...
@@ -99,7 +103,8 @@ struct MatmulInfo {
return
;
}
if
((
col_major
&&
c_matrix
.
col_stride
==
1
)
||
(
!
col_major
&&
c_matrix
.
row_stride
==
1
))
{
if
((
layout
==
MatrixLayout
::
COL_MAJOR
&&
c_matrix
.
col_stride
==
1
)
||
(
layout
==
MatrixLayout
::
ROW_MAJOR
&&
c_matrix
.
row_stride
==
1
))
{
c_matrix
.
transpose
();
b_matrix
.
transpose
();
a_matrix
.
transpose
();
...
...
@@ -112,5 +117,6 @@ struct MatmulInfo {
k
=
a_matrix
.
cols
;
}
};
}
// namespace matmul
#endif // __BLAS_H__
src/infiniop/ops/matmul/cpu/matmul_cpu.cc
View file @
f7137096
#include "./matmul_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../utils.h"
#include <cmath>
#include <iostream>
infiniopStatus_t
cpuCreateMatmulDescriptor
(
infiniopCpuHandle_t
handle
,
infiniopMatmulCpuDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
namespace
matmul
::
cpu
{
Descriptor
::~
Descriptor
()
=
default
;
infiniopStatus_t
Descriptor
::
create
(
infiniopCpuHandle_t
handle
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
infiniDtype_t
dtype
=
c_desc
->
dtype
;
...
...
@@ -14,32 +19,26 @@ infiniopStatus_t cpuCreateMatmulDescriptor(
}
infiniopStatus_t
status
;
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
);
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
COL_MAJOR
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
*
desc_ptr
=
new
MatmulCpuDescriptor
{
INFINI_DEVICE_CPU
,
dtype
,
info
};
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
cpuGetMatmulWorkspaceSize
(
infiniopMatmulCpuDescriptor_t
desc
,
size_t
*
size
)
{
*
size
=
0
;
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
cpuDestroyMatmulDescriptor
(
infiniopMatmulCpuDescriptor_t
desc
)
{
delete
desc
;
*
desc_ptr
=
new
Descriptor
(
dtype
,
info
,
0
,
nullptr
,
handle
->
device
,
handle
->
device_id
);
return
INFINIOP_STATUS_SUCCESS
;
}
template
<
typename
Tdata
>
infiniopStatus_t
cpuCalculateMatmul
(
infiniopMatmulCpuDescriptor_t
desc
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
)
{
void
calculate
(
Descriptor
const
*
desc
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
)
{
auto
info
=
desc
->
info
;
if
(
info
.
is_transed
)
{
...
...
@@ -72,17 +71,30 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
}
}
}
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
cpuMatmul
(
infiniopMatmulCpuDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
)
{
if
(
desc
->
dtype
==
INFINI_DTYPE_F16
)
{
return
cpuCalculateMatmul
<
uint16_t
>
(
desc
,
c
,
beta
,
a
,
b
,
alpha
);
}
if
(
desc
->
dtype
==
INFINI_DTYPE_F32
)
{
return
cpuCalculateMatmul
<
float
>
(
desc
,
c
,
beta
,
a
,
b
,
alpha
);
infiniopStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
const
{
switch
(
dtype
)
{
case
INFINI_DTYPE_F16
:
cpu
::
calculate
<
uint16_t
>
(
this
,
c
,
beta
,
a
,
b
,
alpha
);
return
INFINIOP_STATUS_SUCCESS
;
case
INFINI_DTYPE_F32
:
cpu
::
calculate
<
float
>
(
this
,
c
,
beta
,
a
,
b
,
alpha
);
return
INFINIOP_STATUS_SUCCESS
;
default:
return
INFINIOP_STATUS_BAD_TENSOR_DTYPE
;
}
return
INFINIOP_STATUS_BAD_TENSOR_DTYPE
;
}
}
// namespace matmul::cpu
src/infiniop/ops/matmul/cpu/matmul_cpu.h
View file @
f7137096
#ifndef __
INFINIOP_
MATMUL_CPU_H__
#define __
INFINIOP_
MATMUL_CPU_H__
#ifndef __MATMUL_CPU_H__
#define __MATMUL_CPU_H__
#include "../
blas
.h"
#include "./matmul
_cpu_api
.h"
#include "../
../../devices/cpu/cpu_handle
.h"
#include ".
.
/matmul.h"
typedef
struct
MatmulCpuDescriptor
{
infiniDevice_t
device
;
infiniDtype_t
dtype
;
MatmulInfo
info
;
}
MatmulCpuDescriptor
;
DESCRIPTOR
(
cpu
,
infiniopCpuHandle_t
)
#endif // __
INFINIOP_
MATMUL_CPU_H__
#endif // __MATMUL_CPU_H__
src/infiniop/ops/matmul/cpu/matmul_cpu_api.h
deleted
100644 → 0
View file @
8e34901e
#ifndef __INFINIOP_MATMUL_CPU_API_H__
#define __INFINIOP_MATMUL_CPU_API_H__
#include "../../../devices/cpu/cpu_handle.h"
#include "infiniop/operator.h"
struct
MatmulCpuDescriptor
;
typedef
struct
MatmulCpuDescriptor
*
infiniopMatmulCpuDescriptor_t
;
infiniopStatus_t
cpuCreateMatmulDescriptor
(
infiniopCpuHandle_t
handle
,
infiniopMatmulCpuDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
);
infiniopStatus_t
cpuGetMatmulWorkspaceSize
(
infiniopMatmulCpuDescriptor_t
desc
,
size_t
*
size
);
infiniopStatus_t
cpuMatmul
(
infiniopMatmulCpuDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
);
infiniopStatus_t
cpuDestroyMatmulDescriptor
(
infiniopMatmulCpuDescriptor_t
desc
);
#endif // __INFINIOP_MATMUL_CPU_API_H__
src/infiniop/ops/matmul/cuda/matmul_cuda.cu
View file @
f7137096
#include "../../utils.h"
#include "
./
matmul_cuda.cuh"
#include "matmul_cuda.cuh"
infiniopStatus_t
cudaCreateMatmulDescriptor
(
infiniopCudaHandle_t
handle
,
infiniopMatmulCudaDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
namespace
matmul
::
cuda
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
cublas_handle_pool
;
};
Descriptor
::~
Descriptor
()
{
delete
_opaque
;
}
infiniopStatus_t
Descriptor
::
create
(
infiniopCudaHandle_t
handle
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
)
{
infiniDtype_t
dtype
=
c_desc
->
dtype
;
if
(
dtype
!=
INFINI_DTYPE_F16
&&
dtype
!=
INFINI_DTYPE_F32
)
{
...
...
@@ -13,27 +24,103 @@ infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
}
infiniopStatus_t
status
;
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
);
auto
info
=
MatmulInfo
(
c_desc
,
a_desc
,
b_desc
,
&
status
,
MatrixLayout
::
COL_MAJOR
);
if
(
status
!=
INFINIOP_STATUS_SUCCESS
)
{
return
status
;
}
*
desc_ptr
=
new
InfiniopMatmulCudaDescriptor
{
handle
->
device
,
dtype
,
handle
->
device_id
,
info
,
handle
->
cublas_handle_pool
};
*
desc_ptr
=
new
Descriptor
(
dtype
,
info
,
0
,
new
Opaque
{
handle
->
cublas_handle_pool
},
handle
->
device
,
handle
->
device_id
);
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
cudaGetMatmulWorkspaceSize
(
infiniopMatmulCudaDescriptor_t
desc
,
size_t
*
size
)
{
*
size
=
0
;
// Batched strided GEMM (C = alpha * A @ B + beta * C) via cuBLAS.
//
// Tdata selects the cuBLAS element/compute types:
//   - half            -> CUDA_R_16F with FP32 accumulation (CUBLAS_COMPUTE_32F)
//   - anything else   -> CUDA_R_32F; TF32 fast path unless
//                        ENABLE_SUGON_CUDA_API is defined.
// Caller must pass Tdata consistent with the actual dtype of a/b/c.
//
// `info` was built with MatrixLayout::COL_MAJOR (see Descriptor::create), so
// per-matrix operand layout is expressed through row_stride/ld()/stride below.
template <typename Tdata>
infiniopStatus_t calculate(
    MatmulInfo const &info,
    std::shared_ptr<Pool<cublasHandle_t>> &cublas_handle_pool,
    void *c,
    float beta,
    void const *a,
    void const *b,
    float alpha,
    cudaStream_t stream) {
    // If MatmulInfo pre-transposed the problem, the roles of A and B swap.
    if (info.is_transed) {
        std::swap(a, b);
    }
    cudaDataType a_type, b_type, c_type;
    cublasComputeType_t compute_type;
    if constexpr (std::is_same<Tdata, half>::value) {
        a_type = b_type = c_type = CUDA_R_16F;
        // Accumulate the half-precision GEMM in FP32 for accuracy.
        compute_type = CUBLAS_COMPUTE_32F;
    } else {
        a_type = b_type = c_type = CUDA_R_32F;
#ifdef ENABLE_SUGON_CUDA_API
        // Sugon's cuBLAS port lacks the TF32 fast compute mode.
        compute_type = CUBLAS_COMPUTE_32F;
#else
        compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
#endif
    }
    // cuBLAS is column-major: a matrix whose row stride is 1 is already
    // column-major and needs no transpose; otherwise ask cuBLAS to transpose.
    auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
    auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
    // Borrow a handle from the pool, bind it to `stream`, and launch one
    // strided-batched GEMM covering all `info.batch` matrices.
    use_cublas(cublas_handle_pool, stream,
               [&](cublasHandle_t handle) {
                   // NOTE(review): the cublasStatus_t returned by
                   // cublasGemmStridedBatchedEx is discarded, so a failing
                   // GEMM is silently ignored — consider propagating it
                   // through use_cublas if its signature allows.
                   cublasGemmStridedBatchedEx(
                       handle,
                       op_a,
                       op_b,
                       static_cast<int>(info.m),
                       static_cast<int>(info.n),
                       static_cast<int>(info.k),
                       &alpha,
                       a,
                       a_type,
                       static_cast<int>(info.a_matrix.ld()),
                       info.a_matrix.stride,
                       b,
                       b_type,
                       static_cast<int>(info.b_matrix.ld()),
                       info.b_matrix.stride,
                       &beta,
                       c,
                       c_type,
                       static_cast<int>(info.c_matrix.ld()),
                       info.c_matrix.stride,
                       static_cast<int>(info.batch),
                       compute_type,
                       CUBLAS_GEMM_DEFAULT_TENSOR_OP);
               });
    return INFINIOP_STATUS_SUCCESS;
}
infiniopStatus_t
cudaDestroyMatmulDescriptor
(
infiniopMatmulCudaDescriptor_t
desc
)
{
desc
->
cublas_handle_pool
=
nullptr
;
delete
desc
;
return
INFINIOP_STATUS_SUCCESS
;
infiniopStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *c,
    float beta,
    void const *a,
    void const *b,
    float alpha,
    void *stream) const {
    // CUDA matmul entry point: dispatches on the descriptor's dtype and
    // forwards to the typed cuBLAS kernel. `workspace`/`workspace_size` are
    // unused (this backend reports a zero workspace requirement).
    switch (dtype) {
    case INFINI_DTYPE_F16:
        // BUGFIX: was cuda::calculate<uint16_t>. The kernel selects its
        // cuBLAS types via `std::is_same<Tdata, half>`, so uint16_t fell
        // through to the F32 branch and ran the GEMM with CUDA_R_32F over
        // fp16 data (the pre-refactor code correctly used <half>).
        return cuda::calculate<half>(
            info, _opaque->cublas_handle_pool,
            c, beta, a, b, alpha,
            (cudaStream_t)stream);
    case INFINI_DTYPE_F32:
        // Propagate the kernel's status directly instead of assuming success.
        return cuda::calculate<float>(
            info, _opaque->cublas_handle_pool,
            c, beta, a, b, alpha,
            (cudaStream_t)stream);
    default:
        // `create` only accepts F16/F32, so this is a defensive guard.
        return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
    }
}
}
// namespace matmul::cuda
src/infiniop/ops/matmul/cuda/matmul_cuda.cuh
View file @
f7137096
#ifndef __
INFINIOP_
MATMUL_CUDA_H__
#define __
INFINIOP_
MATMUL_CUDA_H__
#ifndef __MATMUL_CUDA_
CU
H__
#define __MATMUL_CUDA_
CU
H__
#include "../../../devices/cuda/common_cuda.cuh"
#include "../blas.h"
#include "matmul_cuda_api.h"
#include <memory>
#include "../../../devices/cuda/cuda_handle.h"
#include "../matmul.h"
typedef
struct
InfiniopMatmulCudaDescriptor
{
infiniDevice_t
device
;
infiniDtype_t
dtype
;
int
device_id
;
MatmulInfo
info
;
std
::
shared_ptr
<
Pool
<
cublasHandle_t
>>
cublas_handle_pool
;
}
InfiniopMatmulCudaDescriptor
;
DESCRIPTOR
(
cuda
,
infiniopCudaHandle_t
)
#endif // __
INFINIOP_
MATMUL_CUDA_H__
#endif // __MATMUL_CUDA_
CU
H__
src/infiniop/ops/matmul/cuda/matmul_cuda_api.h
deleted
100644 → 0
View file @
8e34901e
#ifndef __INFINIOP_MATMUL_CUDA_API_H__
#define __INFINIOP_MATMUL_CUDA_API_H__
#include "../../../devices/cuda/cuda_handle.h"
#include "infiniop/operator.h"
struct
InfiniopMatmulCudaDescriptor
;
typedef
struct
InfiniopMatmulCudaDescriptor
*
infiniopMatmulCudaDescriptor_t
;
infiniopStatus_t
cudaCreateMatmulDescriptor
(
infiniopCudaHandle_t
handle
,
infiniopMatmulCudaDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
c_desc
,
infiniopTensorDescriptor_t
a_desc
,
infiniopTensorDescriptor_t
b_desc
);
infiniopStatus_t
cudaGetMatmulWorkspaceSize
(
infiniopMatmulCudaDescriptor_t
desc
,
size_t
*
size
);
infiniopStatus_t
cudaMatmul
(
infiniopMatmulCudaDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
,
void
*
stream
);
infiniopStatus_t
cudaDestroyMatmulDescriptor
(
infiniopMatmulCudaDescriptor_t
desc
);
#endif // __INFINIOP_MATMUL_CUDA_API_H__
src/infiniop/ops/matmul/cuda/matmul_cuda_kernel.cu
deleted
100644 → 0
View file @
8e34901e
#include "../../utils.h"
#include "./matmul_cuda.cuh"
template
<
typename
Tdata
>
infiniopStatus_t
cudaMatmulCublas
(
infiniopMatmulCudaDescriptor_t
desc
,
void
*
c
,
float
beta
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
void
*
stream
)
{
auto
info
=
desc
->
info
;
if
(
info
.
is_transed
)
{
std
::
swap
(
a
,
b
);
}
cudaDataType
a_type
,
b_type
,
c_type
;
cublasComputeType_t
compute_type
;
if
constexpr
(
std
::
is_same
<
Tdata
,
half
>::
value
)
{
a_type
=
b_type
=
c_type
=
CUDA_R_16F
;
compute_type
=
CUBLAS_COMPUTE_32F
;
}
else
{
a_type
=
b_type
=
c_type
=
CUDA_R_32F
;
#ifdef ENABLE_SUGON_CUDA_API
compute_type
=
CUBLAS_COMPUTE_32F
;
#else
compute_type
=
CUBLAS_COMPUTE_32F_FAST_TF32
;
#endif
}
auto
op_a
=
info
.
a_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
auto
op_b
=
info
.
b_matrix
.
row_stride
==
1
?
CUBLAS_OP_N
:
CUBLAS_OP_T
;
use_cublas
(
desc
->
cublas_handle_pool
,
desc
->
device_id
,
(
cudaStream_t
)
stream
,
[
&
](
cublasHandle_t
handle
)
{
cublasGemmStridedBatchedEx
(
handle
,
op_a
,
op_b
,
static_cast
<
int
>
(
info
.
m
),
static_cast
<
int
>
(
info
.
n
),
static_cast
<
int
>
(
info
.
k
),
&
alpha
,
a
,
a_type
,
static_cast
<
int
>
(
info
.
a_matrix
.
ld
()),
info
.
a_matrix
.
stride
,
b
,
b_type
,
static_cast
<
int
>
(
info
.
b_matrix
.
ld
()),
info
.
b_matrix
.
stride
,
&
beta
,
c
,
c_type
,
static_cast
<
int
>
(
info
.
c_matrix
.
ld
()),
info
.
c_matrix
.
stride
,
static_cast
<
int
>
(
info
.
batch
),
compute_type
,
CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
});
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
cudaMatmul
(
infiniopMatmulCudaDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
c
,
void
const
*
a
,
void
const
*
b
,
float
alpha
,
float
beta
,
void
*
stream
)
{
if
(
desc
->
dtype
==
INFINI_DTYPE_F16
)
{
return
cudaMatmulCublas
<
half
>
(
desc
,
c
,
beta
,
a
,
b
,
alpha
,
stream
);
}
if
(
desc
->
dtype
==
INFINI_DTYPE_F32
)
{
return
cudaMatmulCublas
<
float
>
(
desc
,
c
,
beta
,
a
,
b
,
alpha
,
stream
);
}
return
INFINIOP_STATUS_BAD_TENSOR_DTYPE
;
}
src/infiniop/ops/matmul/matmul.h
0 → 100644
View file @
f7137096
#ifndef __MATMUL_H__
#define __MATMUL_H__

#include "blas.h"
#include "infiniop/operator.h"

// DESCRIPTOR(NAMESPACE, HANDLE) stamps out the per-backend Matmul descriptor
// class `matmul::NAMESPACE::Descriptor`.
//
// Each backend (cpu, cuda, ...) invokes this macro once in its header and
// then defines in its own translation unit:
//   - `struct Descriptor::Opaque`   — backend-private state (declared here as
//                                     an incomplete type; e.g. handle pools);
//   - `Descriptor::~Descriptor()`   — releases `_opaque`;
//   - `Descriptor::create(...)`     — validates the tensor descriptors and
//                                     builds a Descriptor from HANDLE;
//   - `Descriptor::calculate(...)`  — computes C = alpha * A @ B + beta * C.
//
// No comments inside the macro body: a `//` comment would splice into the
// `\` line continuations.
#define DESCRIPTOR(NAMESPACE, HANDLE)                        \
                                                             \
    namespace matmul::NAMESPACE {                            \
    class Descriptor final : public InfiniopDescriptor {     \
        struct Opaque;                                       \
        Opaque *_opaque;                                     \
                                                             \
        Descriptor(                                          \
            infiniDtype_t dtype_,                            \
            MatmulInfo info_,                                \
            size_t workspace_size_,                          \
            Opaque *opaque,                                  \
            infiniDevice_t device_type,                      \
            int device_id)                                   \
            : InfiniopDescriptor{device_type, device_id},    \
              _opaque(opaque),                               \
              dtype(dtype_),                                 \
              info(info_),                                   \
              workspace_size(workspace_size_) {}             \
                                                             \
    public:                                                  \
        infiniDtype_t dtype;                                 \
        MatmulInfo info;                                     \
        size_t workspace_size;                               \
                                                             \
        ~Descriptor();                                       \
                                                             \
        static infiniopStatus_t create(                      \
            HANDLE handle,                                   \
            Descriptor **desc_ptr,                           \
            infiniopTensorDescriptor_t c_desc,               \
            infiniopTensorDescriptor_t a_desc,               \
            infiniopTensorDescriptor_t b_desc);              \
                                                             \
        infiniopStatus_t calculate(                          \
            void *workspace,                                 \
            size_t workspace_size,                           \
            void *c,                                         \
            float beta,                                      \
            void const *a,                                   \
            void const *b,                                   \
            float alpha,                                     \
            void *stream) const;                             \
    };                                                       \
    }

#endif // __MATMUL_H__
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment