Commit 52536c5a authored by YdrMaster's avatar YdrMaster
Browse files

issue/63/style: 整理代码,优化风格


Signed-off-by: default avatarYdrMaster <ydrml@hotmail.com>
parent f23aa206
#include "matmul_ascend.h"
#include "../../../devices/ascend/ascend_handle.h"
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include <acl/acl_base.h>
......@@ -10,16 +11,16 @@ namespace matmul::ascend {
struct Descriptor::Opaque {
    // Recorded aclnn executor, re-created by each calculate() call via
    // aclnnGemmGetWorkspaceSize; mutable so the pointer can be refreshed
    // through a const Descriptor.
    mutable aclOpExecutor *executor;
    // aclnn tensor descriptors for the output (c) and the inputs (a, b);
    // owned by this struct and released in the destructor.
    aclnnTensorDescriptor_t c, a, b;
    // cubeMathType
    // see doc:
    // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
    int8_t mt;

    ~Opaque() {
        delete c;
        delete a;
        delete b;
        aclDestroyAclOpExecutor(executor);
    }
};
......@@ -29,76 +30,77 @@ Descriptor::~Descriptor() {
}
infiniopStatus_t Descriptor::create(
infiniopAscendHandle_t handle,
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype;
auto handle = reinterpret_cast<infiniopAscendHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR);
auto _info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
auto cDesc = new aclnnTensorDescriptor(),
aDesc = new aclnnTensorDescriptor(),
bDesc = new aclnnTensorDescriptor();
auto c = new aclnnTensorDescriptor(),
a = new aclnnTensorDescriptor(),
b = new aclnnTensorDescriptor();
// Treat A, B, C as 2D matrix, reuse aclnnTensorDescriptor for batched
// operation
CHECK_STATUS(cDesc->setDescriptor(
CHECK_STATUS(c->setDescriptor(
toAclDataType(c_desc->dtype),
{static_cast<int64_t>(info.c_matrix.rows),
static_cast<int64_t>(info.c_matrix.cols)},
{info.c_matrix.row_stride, info.c_matrix.col_stride}),
{static_cast<int64_t>(_info.c_matrix.rows),
static_cast<int64_t>(_info.c_matrix.cols)},
{_info.c_matrix.row_stride, _info.c_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->setDescriptor(
CHECK_STATUS(a->setDescriptor(
toAclDataType(a_desc->dtype),
{static_cast<int64_t>(info.a_matrix.rows),
static_cast<int64_t>(info.a_matrix.cols)},
{info.a_matrix.row_stride, info.a_matrix.col_stride}),
{static_cast<int64_t>(_info.a_matrix.rows),
static_cast<int64_t>(_info.a_matrix.cols)},
{_info.a_matrix.row_stride, _info.a_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->setDescriptor(
CHECK_STATUS(b->setDescriptor(
toAclDataType(b_desc->dtype),
{static_cast<int64_t>(info.b_matrix.rows),
static_cast<int64_t>(info.b_matrix.cols)},
{info.b_matrix.row_stride, info.b_matrix.col_stride}),
{static_cast<int64_t>(_info.b_matrix.rows),
static_cast<int64_t>(_info.b_matrix.cols)},
{_info.b_matrix.row_stride, _info.b_matrix.col_stride}),
INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(cDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(aDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(bDesc->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(c->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(a->createTensor(), INFINIOP_STATUS_SUCCESS);
CHECK_STATUS(b->createTensor(), INFINIOP_STATUS_SUCCESS);
auto tc = cDesc->t,
ta = aDesc->t,
tb = bDesc->t;
auto tc = c->t,
ta = a->t,
tb = b->t;
aclOpExecutor *executor;
size_t workspaceSize;
size_t workspace_size;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
int8_t mt = 1;
auto ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, .5, .5, 0, 0, tc, mt, &workspaceSize, &executor);
auto ret = aclnnGemmGetWorkspaceSize(ta, tb, tc, .5, .5, 0, 0, tc, mt, &workspace_size, &executor);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
aclSetAclOpExecutorRepeatable(executor);
*desc_ptr = new Descriptor(
dtype, info, workspaceSize,
dtype, _info, workspace_size,
new Opaque{
executor,
cDesc,
aDesc,
bDesc,
c,
a,
b,
mt,
},
handle->device, handle->device_id);
......@@ -115,28 +117,29 @@ infiniopStatus_t Descriptor::calculate(
float alpha,
void *stream) const {
auto tc = _opaque->cDesc->t,
ta = _opaque->aDesc->t,
tb = _opaque->bDesc->t;
auto tc = _opaque->c->t,
ta = _opaque->a->t,
tb = _opaque->b->t;
size_t workspaceSize;
size_t workspace_size;
auto ret = aclnnGemmGetWorkspaceSize(
ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
&workspaceSize, &(_opaque->executor));
&workspace_size, &(_opaque->executor));
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemmGetWorkspaceSize failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
if (workspaceSize_ < workspaceSize) {
if (workspaceSize_ < workspace_size) {
return INFINIOP_STATUS_INSUFFICIENT_WORKSPACE;
}
aclSetAclOpExecutorRepeatable(_opaque->executor);
for (size_t i = 0; i < info.batch; ++i) {
AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * info.a_matrix.stride * infiniSizeof(dtype));
AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * info.b_matrix.stride * infiniSizeof(dtype));
AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * info.c_matrix.stride * infiniSizeof(dtype));
AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * info.c_matrix.stride * infiniSizeof(dtype));
ret = aclnnGemm(workspace, workspaceSize, _opaque->executor, stream);
auto unit = infiniSizeof(_dtype);
for (size_t i = 0; i < _info.batch; ++i) {
AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
ret = aclnnGemm(workspace, workspace_size, _opaque->executor, stream);
CHECK_RET(ret == ACL_SUCCESS,
LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
return INFINIOP_STATUS_INTERNAL_ERROR);
......
#ifndef __MATMUL_ASCEND_H__
#define __MATMUL_ASCEND_H__

#include "../../../devices/ascend/ascend_handle.h"
#include "../matmul.h"

// Declare the Ascend matmul Descriptor. The DESCRIPTOR macro now takes only
// the backend namespace; create() receives the generic infiniopHandle_t and
// casts it internally.
DESCRIPTOR(ascend)

#endif // __MATMUL_ASCEND_H__
#include "matmul_bang.h"
#include "../../../devices/bang/bang_handle.h"
#include "../../../devices/bang/common_bang.h"
#include "../../utils.h"
#include <cnnl_extra.h>
......@@ -6,17 +7,17 @@
namespace matmul::bang {
struct Descriptor::Opaque {
    // cnnl matmul operator descriptor, chosen algorithm and the heuristic
    // result it was picked from; all owned by this struct.
    cnnlMatMulDescriptor_t op;
    cnnlMatMulAlgo_t algo;
    cnnlMatMulHeuristicResult_t algoResult;
    // cnnl tensor descriptors for the inputs (a, b) and the output (c).
    cnnlTensorDescriptor_t a, b, c;
    // Pool of cnnl handles shared with the device handle.
    std::shared_ptr<Pool<cnnlHandle_t>> cnnl_handle_pool;

    ~Opaque() {
        cnnlDestroyTensorDescriptor(a);
        cnnlDestroyTensorDescriptor(b);
        cnnlDestroyTensorDescriptor(c);
        cnnlMatMulDescDestroy(op);
        cnnlMatMulAlgoDestroy(algo);
        cnnlDestroyMatMulHeuristicResult(algoResult);
    }
......@@ -59,41 +60,42 @@ Descriptor::~Descriptor() {
}
infiniopStatus_t Descriptor::create(
infiniopBangHandle_t handle,
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype;
auto handle = reinterpret_cast<infiniopBangHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR);
auto _info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
cnnlTensorDescriptor_t aDesc, bDesc, cDesc;
cnnlCreateTensorDescriptor(&aDesc);
cnnlCreateTensorDescriptor(&bDesc);
cnnlCreateTensorDescriptor(&cDesc);
cnnlTensorDescriptor_t a, b, c;
cnnlCreateTensorDescriptor(&a);
cnnlCreateTensorDescriptor(&b);
cnnlCreateTensorDescriptor(&c);
setMatrixTensorEx(aDesc, info.a_matrix, a_desc->dtype);
setMatrixTensorEx(bDesc, info.b_matrix, b_desc->dtype);
setMatrixTensorEx(cDesc, info.c_matrix, c_desc->dtype);
setMatrixTensorEx(a, _info.a_matrix, a_desc->dtype);
setMatrixTensorEx(b, _info.b_matrix, b_desc->dtype);
setMatrixTensorEx(c, _info.c_matrix, c_desc->dtype);
cnnlMatMulDescriptor_t opDesc;
cnnlMatMulDescriptor_t op;
cnnlMatMulAlgo_t algo;
cnnlMatMulHeuristicResult_t algoResult;
cnnlMatMulDescCreate(&opDesc);
cnnlMatMulDescCreate(&op);
cnnlMatMulAlgoCreate(&algo);
cnnlCreateMatMulHeuristicResult(&algoResult);
int32_t use_stride = true;
cnnlSetMatMulDescAttr(
opDesc,
op,
CNNL_MATMUL_USE_STRIDE,
&use_stride,
sizeof(int32_t));
......@@ -102,7 +104,7 @@ infiniopStatus_t Descriptor::create(
[&](cnnlHandle_t _handle) {
cnnlGetBatchMatMulAlgoHeuristic(
_handle,
opDesc, aDesc, bDesc, cDesc,
op, a, b, c,
NULL, 1, &algoResult, &count);
});
......@@ -110,14 +112,14 @@ infiniopStatus_t Descriptor::create(
cnnlGetBatchMatMulHeuristicResult(algoResult, algo, &workspace_size);
*desc_ptr = new Descriptor(
dtype, info, workspace_size,
dtype, _info, workspace_size,
new Opaque{
opDesc,
op,
algo,
algoResult,
aDesc,
bDesc,
cDesc,
a,
b,
c,
handle->cnnl_handle_pool},
handle->device, handle->device_id);
return INFINIOP_STATUS_SUCCESS;
......@@ -133,7 +135,7 @@ infiniopStatus_t Descriptor::calculate(
float alpha,
void *stream) const {
if (info.is_transed) {
if (_info.is_transed) {
std::swap(a, b);
}
use_cnnl(_opaque->cnnl_handle_pool,
......@@ -141,13 +143,13 @@ infiniopStatus_t Descriptor::calculate(
[&](cnnlHandle_t handle) {
cnnlBatchMatMulBCast_v2(
handle,
_opaque->opDesc,
_opaque->op,
_opaque->algo,
&alpha,
_opaque->aDesc, a,
_opaque->bDesc, b,
_opaque->a, a,
_opaque->b, b,
&beta,
_opaque->cDesc, c,
_opaque->c, c,
workspace,
workspace_size);
});
......
#ifndef __MATMUL_BANG_H__
#define __MATMUL_BANG_H__

#include "../../../devices/bang/bang_handle.h"
#include "../matmul.h"

// Declare the Cambricon BANG matmul Descriptor. The DESCRIPTOR macro now
// takes only the backend namespace; create() receives the generic
// infiniopHandle_t and casts it internally.
DESCRIPTOR(bang)

#endif // __MATMUL_BANG_H__
#include "./matmul_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h"
#include <iostream>
namespace matmul::cpu {
......@@ -7,25 +8,26 @@ namespace matmul::cpu {
Descriptor::~Descriptor() = default;
infiniopStatus_t Descriptor::create(
infiniopCpuHandle_t handle,
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype;
auto handle = reinterpret_cast<infiniopCpuHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::COL_MAJOR);
auto _info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::COL_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
*desc_ptr = new Descriptor(
dtype, info, 0,
dtype, _info, 0,
nullptr,
handle->device, handle->device_id);
return INFINIOP_STATUS_SUCCESS;
......@@ -33,26 +35,24 @@ infiniopStatus_t Descriptor::create(
template <typename Tdata>
void calculate(
Descriptor const *desc,
MatmulInfo const &_info,
void *c,
float beta,
void const *a,
void const *b,
float alpha) {
auto info = desc->info;
if (info.is_transed) {
if (_info.is_transed) {
std::swap(a, b);
}
for (size_t i = 0; i < info.batch; ++i) {
for (size_t m_ = 0; m_ < info.m; ++m_) {
for (size_t n_ = 0; n_ < info.n; ++n_) {
auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride;
for (size_t i = 0; i < _info.batch; ++i) {
for (size_t m_ = 0; m_ < _info.m; ++m_) {
for (size_t n_ = 0; n_ < _info.n; ++n_) {
auto c_ = reinterpret_cast<Tdata *>(c) + i * _info.c_matrix.stride + m_ * _info.c_matrix.row_stride + n_ * _info.c_matrix.col_stride;
float sum = 0;
for (size_t k_ = 0; k_ < info.k; ++k_) {
auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride;
for (size_t k_ = 0; k_ < _info.k; ++k_) {
auto a_ = reinterpret_cast<Tdata const *>(a) + i * _info.a_matrix.stride + m_ * _info.a_matrix.row_stride + k_ * _info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) + i * _info.b_matrix.stride + n_ * _info.b_matrix.col_stride + k_ * _info.b_matrix.row_stride;
if constexpr (std::is_same<Tdata, uint16_t>::value) {
sum += f16_to_f32(*a_) * f16_to_f32(*b_);
} else {
......@@ -83,13 +83,13 @@ infiniopStatus_t Descriptor::calculate(
float alpha,
void *stream) const {
switch (dtype) {
switch (_dtype) {
case INFINI_DTYPE_F16:
cpu::calculate<uint16_t>(this, c, beta, a, b, alpha);
cpu::calculate<uint16_t>(_info, c, beta, a, b, alpha);
return INFINIOP_STATUS_SUCCESS;
case INFINI_DTYPE_F32:
cpu::calculate<float>(this, c, beta, a, b, alpha);
cpu::calculate<float>(_info, c, beta, a, b, alpha);
return INFINIOP_STATUS_SUCCESS;
default:
......
#ifndef __MATMUL_CPU_H__
#define __MATMUL_CPU_H__

#include "../../../devices/cpu/cpu_handle.h"
#include "../matmul.h"

// Declare the CPU matmul Descriptor. The DESCRIPTOR macro now takes only the
// backend namespace; create() receives the generic infiniopHandle_t and
// casts it internally.
DESCRIPTOR(cpu)

#endif // __MATMUL_CPU_H__
#include "../../../devices/cuda/common_cuda.cuh"
#include "../../utils.h"
#include "matmul_cuda.cuh"
......@@ -12,25 +13,26 @@ Descriptor::~Descriptor() {
}
infiniopStatus_t Descriptor::create(
infiniopCudaHandle_t handle,
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
infiniDtype_t dtype = c_desc->dtype;
auto handle = reinterpret_cast<infiniopCudaHandle_t>(handle_);
auto dtype = c_desc->dtype;
if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
return INFINIOP_STATUS_BAD_TENSOR_DTYPE;
}
infiniopStatus_t status;
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::COL_MAJOR);
auto _info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::COL_MAJOR);
if (status != INFINIOP_STATUS_SUCCESS) {
return status;
}
*desc_ptr = new Descriptor(
dtype, info, 0,
dtype, _info, 0,
new Opaque{handle->cublas_handle_pool},
handle->device, handle->device_id);
return INFINIOP_STATUS_SUCCESS;
......@@ -38,7 +40,7 @@ infiniopStatus_t Descriptor::create(
template <typename Tdata>
infiniopStatus_t calculate(
MatmulInfo const &info,
MatmulInfo const &_info,
std::shared_ptr<Pool<cublasHandle_t>> &cublas_handle_pool,
void *c,
float beta,
......@@ -47,7 +49,7 @@ infiniopStatus_t calculate(
float alpha,
cudaStream_t stream) {
if (info.is_transed) {
if (_info.is_transed) {
std::swap(a, b);
}
......@@ -65,8 +67,8 @@ infiniopStatus_t calculate(
#endif
}
auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_a = _info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = _info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
use_cublas(cublas_handle_pool,
stream,
......@@ -75,24 +77,24 @@ infiniopStatus_t calculate(
handle,
op_a,
op_b,
static_cast<int>(info.m),
static_cast<int>(info.n),
static_cast<int>(info.k),
static_cast<int>(_info.m),
static_cast<int>(_info.n),
static_cast<int>(_info.k),
&alpha,
a,
a_type,
static_cast<int>(info.a_matrix.ld()),
info.a_matrix.stride,
static_cast<int>(_info.a_matrix.ld()),
_info.a_matrix.stride,
b,
b_type,
static_cast<int>(info.b_matrix.ld()),
info.b_matrix.stride,
static_cast<int>(_info.b_matrix.ld()),
_info.b_matrix.stride,
&beta,
c,
c_type,
static_cast<int>(info.c_matrix.ld()),
info.c_matrix.stride,
static_cast<int>(info.batch),
static_cast<int>(_info.c_matrix.ld()),
_info.c_matrix.stride,
static_cast<int>(_info.batch),
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
});
......@@ -109,13 +111,13 @@ infiniopStatus_t Descriptor::calculate(
float alpha,
void *stream) const {
switch (dtype) {
switch (_dtype) {
case INFINI_DTYPE_F16:
cuda::calculate<uint16_t>(info, _opaque->cublas_handle_pool, c, beta, a, b, alpha, (cudaStream_t)stream);
cuda::calculate<uint16_t>(_info, _opaque->cublas_handle_pool, c, beta, a, b, alpha, (cudaStream_t)stream);
return INFINIOP_STATUS_SUCCESS;
case INFINI_DTYPE_F32:
cuda::calculate<float>(info, _opaque->cublas_handle_pool, c, beta, a, b, alpha, (cudaStream_t)stream);
cuda::calculate<float>(_info, _opaque->cublas_handle_pool, c, beta, a, b, alpha, (cudaStream_t)stream);
return INFINIOP_STATUS_SUCCESS;
default:
......
#ifndef __MATMUL_CUDA_CUH__
#define __MATMUL_CUDA_CUH__

#include "../../../devices/cuda/cuda_handle.h"
#include "../matmul.h"

// Declare the CUDA matmul Descriptor. The DESCRIPTOR macro now takes only
// the backend namespace; create() receives the generic infiniopHandle_t and
// casts it internally.
DESCRIPTOR(cuda)

#endif // __MATMUL_CUDA_CUH__
......@@ -2,37 +2,38 @@
#define __MATMUL_H__
#include "blas.h"
#include "infiniop/handle.h"
#include "infiniop/operator.h"
#define DESCRIPTOR(NAMESPACE, HANDLE) \
#define DESCRIPTOR(NAMESPACE) \
\
namespace matmul::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
infiniDtype_t _dtype; \
MatmulInfo _info; \
\
Descriptor( \
infiniDtype_t dtype_, \
MatmulInfo info_, \
infiniDtype_t dtype, \
MatmulInfo info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
dtype(dtype_), \
info(info_), \
_dtype(dtype), \
_info(info), \
workspace_size(workspace_size_) {} \
\
public: \
infiniDtype_t dtype; \
MatmulInfo info; \
size_t workspace_size; \
\
~Descriptor(); \
\
static infiniopStatus_t create( \
HANDLE handle, \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t c_desc, \
infiniopTensorDescriptor_t a_desc, \
......
......@@ -20,10 +20,10 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor(
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, HANDLE, NAMESPACE) \
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return matmul::NAMESPACE::Descriptor::create( \
reinterpret_cast<HANDLE>(handle), \
handle, \
reinterpret_cast<matmul::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
a_desc, \
......@@ -32,16 +32,16 @@ __C infiniopStatus_t infiniopCreateMatmulDescriptor(
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, infiniopCpuHandle_t, cpu);
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
CREATE(INFINI_DEVICE_NVIDIA, infiniopCudaHandle_t, cuda);
CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, infiniopBangHandle_t, bang);
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, infiniopAscendHandle_t, ascend);
CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
default:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment