Commit d854dbee authored by zhangyue's avatar zhangyue
Browse files

issue/111: Merge branch 'main' of github.com:PanZezhong1725/InfiniCore into...

issue/111: Merge branch 'main' of github.com:PanZezhong1725/InfiniCore into issue/111-rmsnorm-kunlun
parents 23ddc20b a474a6f5
...@@ -20,35 +20,23 @@ jobs: ...@@ -20,35 +20,23 @@ jobs:
- name: checkout code - name: checkout code
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: install black - name: Check Format
run: pip install black run: |
pip install black
- name: check format python3 scripts/format.py --path src --check
run: python3 scripts/format.py --path src --check
- name: install xmake - name: install xmake
uses: xmake-io/github-action-setup-xmake@v1 uses: xmake-io/github-action-setup-xmake@v1
with: with:
xmake-version: latest xmake-version: latest
- name: configure xmake - name: Build & Install
run: xmake f --omp=y -cv run: python scripts/install.py --omp=y
- name: build with xmake
run: xmake build
- name: install to INFINI_ROOT - name: install python packages
if: matrix.os != 'windows-latest'
run: xmake install
- name: build infiniop-test
if: matrix.os != 'windows-latest'
run: xmake build infiniop-test
- name: python test
if: matrix.os != 'windows-latest'
run: | run: |
pip install numpy
pip install torch pip install torch
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/gemm.py --cpu
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/rms_norm.py --cpu - name: Python Test
LD_LIBRARY_PATH=$HOME/.infini/lib python test/infiniop/random_sample.py --cpu run: python scripts/python_test.py --cpu
The MIT License (MIT)
Copyright © 2025 InfiniTensor
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
...@@ -15,6 +15,34 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功 ...@@ -15,6 +15,34 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
## 配置和使用 ## 配置和使用
### 软件依赖
- XMake编译器
XMake配置选项(`XMAKE_CONFIG_FLAGS`)以及含义
- `--omp=[y|n]` 是否使用OpenMP,默认开启
- `--cpu=[y|n]` 是否编译CPU接口实现,默认开启
- `--nv-gpu=[y|n]` 是否编译英伟达GPU接口实现
- `--ascend-npu=[y|n]` 是否编译昇腾NPU接口实现
- `--cambricon-mlu=[y|n]` 是否编译寒武纪MLU接口实现
- `--metax-gpu=[y|n]` 是否编译沐曦GPU接口实现
- `--moore-gpu=[y|n]` 是否编译摩尔线程GPU接口实现
- `--sugon-dcu=[y|n]` 是否编译曙光DCU接口实现
- `--kunlun-xpu=[y|n]` 是否编译昆仑XPU接口实现
### 一键安装
`scripts/` 目录中提供了 `install.py` 安装脚本。使用方式如下:
```shell
cd InfiniCore
python scripts/install.py [XMAKE_CONFIG_FLAGS]
```
### 手动安装
1. 项目配置 1. 项目配置
- 查看当前配置 - 查看当前配置
...@@ -55,11 +83,23 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功 ...@@ -55,11 +83,23 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
按输出提示设置 `INFINI_ROOT``LD_LIBRARY_PATH` 环境变量。 按输出提示设置 `INFINI_ROOT``LD_LIBRARY_PATH` 环境变量。
4. 运行算子测试 ### 运行测试
```shell #### 运行Python算子测试
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend]
``` ```shell
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend]
```
#### 一键运行所有Python算子测试
```shell
python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend]
```
#### 算子测试框架
详见 `test/infiniop-test` 目录
## 开发指南 ## 开发指南
......
...@@ -65,10 +65,10 @@ typedef enum { ...@@ -65,10 +65,10 @@ typedef enum {
INFINI_DTYPE_F16 = 12, INFINI_DTYPE_F16 = 12,
INFINI_DTYPE_F32 = 13, INFINI_DTYPE_F32 = 13,
INFINI_DTYPE_F64 = 14, INFINI_DTYPE_F64 = 14,
INFINI_DTYPE_C8 = 15, INFINI_DTYPE_C16 = 15,
INFINI_DTYPE_C16 = 16, INFINI_DTYPE_C32 = 16,
INFINI_DTYPE_C32 = 17, INFINI_DTYPE_C64 = 17,
INFINI_DTYPE_C64 = 18, INFINI_DTYPE_C128 = 18,
INFINI_DTYPE_BF16 = 19, INFINI_DTYPE_BF16 = 19,
} infiniDtype_t; } infiniDtype_t;
......
import os
import subprocess
import platform
import sys
from set_env import set_env
# All xmake commands below assume the repository root as the working
# directory, regardless of where this script was invoked from.
PROJECT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
os.chdir(PROJECT_DIR)
def run_cmd(cmd):
    """Run `cmd` through the shell, raising CalledProcessError on failure."""
    subprocess.run(
        cmd,
        shell=True,
        check=True,
        text=True,
        encoding="utf-8",
    )
def install(xmake_config_flags=""):
    """Configure, build, and install InfiniCore and the infiniop-test tool.

    Args:
        xmake_config_flags: extra flags forwarded to `xmake f` (e.g. "--omp=y").
    """
    steps = [
        f"xmake f {xmake_config_flags} -cv",
        "xmake",
        "xmake install",
        "xmake build infiniop-test",
        "xmake install infiniop-test",
    ]
    for step in steps:
        run_cmd(step)
if __name__ == "__main__":
    # Prepare INFINI_ROOT-related environment, then build & install with any
    # user-supplied xmake config flags.
    set_env()
    flags = " ".join(sys.argv[1:])
    install(flags)
import os
import subprocess
from set_env import set_env
import sys
# Operator test scripts are run from test/infiniop so that their relative
# paths (data files, sibling imports) resolve correctly.
PROJECT_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "test", "infiniop")
)
os.chdir(PROJECT_DIR)
def run_tests(args):
    """Run every infiniop operator test script with the given flag string.

    Args:
        args: extra command-line flags (e.g. "--cpu") appended to each
            test invocation as-is.

    Returns:
        A list of the script names that exited with a non-zero status.
    """
    failed = []
    for test in [
        "gemm.py",
        "rms_norm.py",
        "causal_softmax.py",
        "swiglu.py",
        "random_sample.py",
    ]:
        # Use the interpreter running this script (quoted, in case the path
        # contains spaces) instead of a bare `python`, which may not exist on
        # PATH (many Linux distributions only ship `python3`).
        result = subprocess.run(
            f'"{sys.executable}" {test} {args}', text=True, encoding="utf-8", shell=True
        )
        if result.returncode != 0:
            failed.append(test)
    return failed
if __name__ == "__main__":
    # Set up the INFINI_ROOT environment, run every operator test, and report
    # results in color (green = all passed, red = failures listed).
    set_env()
    failures = run_tests(" ".join(sys.argv[1:]))
    if failures:
        print("\033[91mThe following tests failed:\033[0m")
        for name in failures:
            print(f"\033[91m - {name}\033[0m")
    else:
        print("\033[92mAll tests passed!\033[0m")
    # The process exit code mirrors the number of failing test scripts.
    exit(len(failures))
import os
import platform
def set_env():
    """Ensure INFINI_ROOT is set and its bin/lib dirs are on the search paths.

    Defaults INFINI_ROOT to ~/.infini when unset, then prepends
    $INFINI_ROOT/bin to PATH (Windows and Linux) and $INFINI_ROOT/lib to
    LD_LIBRARY_PATH (Linux only).

    Raises:
        RuntimeError: on any platform other than Windows or Linux.
    """
    if os.environ.get("INFINI_ROOT") is None:
        os.environ["INFINI_ROOT"] = os.path.expanduser("~/.infini")
    root = os.environ["INFINI_ROOT"]
    system = platform.system()
    if system == "Windows":
        _prepend_env("PATH", os.path.expanduser(root + "/bin"), ";")
    elif system == "Linux":
        _prepend_env("PATH", os.path.expanduser(root + "/bin"), ":")
        _prepend_env("LD_LIBRARY_PATH", os.path.expanduser(root + "/lib"), ":")
    else:
        raise RuntimeError("Unsupported platform.")


def _prepend_env(var, path, sep):
    # Prepend `path` to os.environ[var] unless it already appears there.
    if path not in os.environ.get(var, ""):
        os.environ[var] = f"{path}{sep}{os.environ.get(var, '')}"
...@@ -8,6 +8,10 @@ std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<i ...@@ -8,6 +8,10 @@ std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<i
return storageShape; return storageShape;
} }
// Total number of elements described by `shape` (product of all dims;
// 1 for an empty shape, matching the scalar convention).
size_t aclnnTensorDescriptor::numel() const {
    size_t count = 1;
    for (const auto dim : shape) {
        count *= static_cast<size_t>(dim);
    }
    return count;
}
aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, void *data) { aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, void *data) {
this->ndim = desc->ndim(); this->ndim = desc->ndim();
this->shape = std::vector<int64_t>(ndim); this->shape = std::vector<int64_t>(ndim);
......
...@@ -34,10 +34,10 @@ struct aclnnTensorDescriptor { ...@@ -34,10 +34,10 @@ struct aclnnTensorDescriptor {
int64_t storageNdim = 1; int64_t storageNdim = 1;
aclTensor *tensor; aclTensor *tensor;
// aclnnGemmGetWorkspaceSize only support 2D matrix multiply, so we need to convert 3D tensor to 2D tensor
aclnnTensorDescriptor(aclDataType dtype, const std::vector<int64_t> &shape, const std::vector<int64_t> &strides, void *data = nullptr); aclnnTensorDescriptor(aclDataType dtype, const std::vector<int64_t> &shape, const std::vector<int64_t> &strides, void *data = nullptr);
aclnnTensorDescriptor(infiniopTensorDescriptor_t y_desc, void *data = nullptr); aclnnTensorDescriptor(infiniopTensorDescriptor_t y_desc, void *data = nullptr);
~aclnnTensorDescriptor(); ~aclnnTensorDescriptor();
size_t numel() const;
std::string toString(); std::string toString();
}; };
......
...@@ -14,6 +14,9 @@ ...@@ -14,6 +14,9 @@
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
#include "ascend/ascend_handle.h" #include "ascend/ascend_handle.h"
#endif #endif
#ifdef ENABLE_MOORE_API
#include "musa/musa_handle.h"
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
#include "kunlun/kunlun_handle.h" #include "kunlun/kunlun_handle.h"
#endif #endif
...@@ -47,6 +50,9 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) { ...@@ -47,6 +50,9 @@ __C infiniStatus_t infiniopCreateHandle(infiniopHandle_t *handle_ptr) {
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend); CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif #endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, musa);
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun); CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif #endif
...@@ -81,6 +87,9 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) { ...@@ -81,6 +87,9 @@ __C infiniStatus_t infiniopDestroyHandle(infiniopHandle_t handle) {
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
DELETE(INFINI_DEVICE_ASCEND, ascend); DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif #endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, musa);
#endif
#ifdef ENABLE_KUNLUN_API #ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun); DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif #endif
......
#include "../../../utils.h"
#include "../pool.h"
#include "musa_handle.h"
#include <mublas.h>
#include <mudnn.h>
#include <musa.h>
#include <musa_fp16_mtgpu.h>
#include <musa_runtime_api.h>
#define CHECK_MUBLAS(API) CHECK_INTERNAL(API, MUBLAS_STATUS_SUCCESS)
#define CHECK_MUDNN(API) CHECK_INTERNAL((int)API, (int)::musa::dnn::Status::SUCCESS)
namespace device::musa {
// Backend-private state shared by all copies of a Moore (MUSA) Handle:
// pools of mublas / mudnn library handles so repeated use avoids re-creation.
class Handle::Internal {
    Pool<std::unique_ptr<mublasHandle_t>> mublas_handles;
    Pool<std::unique_ptr<::musa::dnn::Handle>> mudnn_handles;

    // Callback type invoked with a ready-to-use library handle.
    template <typename T>
    using Fn = std::function<infiniStatus_t(T)>;

public:
    // Borrow a mublas handle bound to `stream`, run `f`, return it to the pool.
    infiniStatus_t useMublas(musaStream_t stream, const Fn<mublasHandle_t> &f) const;
    // Borrow a mudnn handle bound to `stream`, run `f`, return it to the pool.
    infiniStatus_t useMudnn(musaStream_t stream, const Fn<::musa::dnn::Handle &> &f) const;
};
} // namespace device::musa
#include "common_musa.h"
namespace device::musa {
// Construct a handle for the given device type and ordinal, creating the
// shared Internal state that caches mublas/mudnn library handles.
Handle::Handle(infiniDevice_t device, int device_id)
    : InfiniopHandle{device, device_id},
      _internal(std::make_shared<Handle::Internal>()) {}
// Convenience constructor: this backend only serves Moore (MUSA) devices.
Handle::Handle(int device_id) : Handle(INFINI_DEVICE_MOORE, device_id) {}
auto Handle::internal() const -> const std::shared_ptr<Internal> & {
return _internal;
}
// Run `f` with a mublas handle bound to `stream`. Handles are cached in a
// pool so repeated calls avoid the cost of mublasCreate; on success the
// handle is returned to the pool for reuse.
// NOTE(review): if mublasSetStream or `f` fails, the CHECK_* early return
// skips the push() below, so the handle is dropped without mublasDestroy —
// presumably acceptable on error paths, but worth confirming.
infiniStatus_t Handle::Internal::useMublas(musaStream_t stream, const Fn<mublasHandle_t> &f) const {
    std::unique_ptr<mublasHandle_t> handle;
    auto opt_handle = mublas_handles.pop();
    if (opt_handle.has_value()) {
        // Reuse a previously created handle from the pool.
        handle = std::move(*opt_handle);
    } else {
        // Pool empty: create a fresh mublas handle.
        handle = std::make_unique<mublasHandle_t>();
        CHECK_MUBLAS(mublasCreate(&(*handle)));
    }
    CHECK_MUBLAS(mublasSetStream(*handle, stream));
    CHECK_STATUS(f(*handle));
    mublas_handles.push(std::move(handle));
    return INFINI_STATUS_SUCCESS;
}
// Run `f` with a mudnn Handle bound to `stream`, reusing pooled handles.
// Unlike useMublas, a new ::musa::dnn::Handle needs no separate create call.
// NOTE(review): an early return from SetStream or `f` drops the handle
// instead of returning it to the pool — confirm this is intended.
infiniStatus_t Handle::Internal::useMudnn(musaStream_t stream, const Fn<::musa::dnn::Handle &> &f) const {
    std::unique_ptr<::musa::dnn::Handle> handle;
    auto opt_handle = mudnn_handles.pop();
    if (opt_handle.has_value()) {
        // Reuse a previously created handle from the pool.
        handle = std::move(*opt_handle);
    } else {
        handle = std::make_unique<::musa::dnn::Handle>();
    }
    CHECK_MUDNN(handle->SetStream(stream));
    CHECK_STATUS(f(*handle));
    mudnn_handles.push(std::move(handle));
    return INFINI_STATUS_SUCCESS;
}
// Factory used by the generic infiniop handle-creation dispatch; the caller
// takes ownership of *handle_ptr and releases it via the matching destroy path.
infiniStatus_t Handle::create(InfiniopHandle **handle_ptr, int device_id) {
    *handle_ptr = new Handle(INFINI_DEVICE_MOORE, device_id);
    return INFINI_STATUS_SUCCESS;
}
} // namespace device::musa
#ifndef __INFINIOP_MUSA_HANDLE_H__
#define __INFINIOP_MUSA_HANDLE_H__
#include "../../handle.h"
#include <memory>
namespace device::musa {
// infiniop handle for Moore Threads (MUSA) devices.
struct Handle : public InfiniopHandle {
    Handle(int device_id);
    // Backend-private state (mublas/mudnn handle pools), defined in the .cc/.h
    // implementation files.
    class Internal;
    // Accessor for the shared backend-private state.
    auto internal() const -> const std::shared_ptr<Internal> &;

public:
    // Allocate a new Handle for `device_id` and store it in *handle_ptr.
    static infiniStatus_t create(InfiniopHandle **handle_ptr, int device_id);

protected:
    Handle(infiniDevice_t device, int device_id);

private:
    std::shared_ptr<Internal> _internal;
};
} // namespace device::musa
#endif // __INFINIOP_MUSA_HANDLE_H__
...@@ -41,7 +41,7 @@ private: ...@@ -41,7 +41,7 @@ private:
struct Node { struct Node {
U data; U data;
Node<U> *next; Node<U> *next;
Node(U &&data) : data(data), next(nullptr) {} Node(U &&data) : data(std::move(data)), next(nullptr) {}
}; };
mutable std::atomic<Node<T> *> _head; mutable std::atomic<Node<T> *> _head;
......
...@@ -2,57 +2,10 @@ ...@@ -2,57 +2,10 @@
#define CAUSAL_SOFTMAX_H #define CAUSAL_SOFTMAX_H
#include "../../operator.h" #include "../../operator.h"
#include "../../tensor.h" #include "info.h"
#include <iostream>
#include <vector>
struct CausalSoftmaxInfo {
infiniDtype_t dtype;
size_t batch_size;
ptrdiff_t stride_b;
size_t seq_len;
ptrdiff_t stride_i;
size_t total_seq_len;
ptrdiff_t stride_j;
};
inline infiniStatus_t createCausalSoftmaxInfo(CausalSoftmaxInfo *info, infiniopTensorDescriptor_t y_desc) {
auto dtype = y_desc->dtype();
if (y_desc->dtype() != INFINI_DTYPE_F16 && y_desc->dtype() != INFINI_DTYPE_F32) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
info->dtype = dtype;
if (y_desc->ndim() != 2 && y_desc->ndim() != 3) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
if (y_desc->shape()[y_desc->ndim() - 1] < y_desc->shape()[y_desc->ndim() - 2]) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
size_t batch_size = 1;
ptrdiff_t stride_b = 0;
size_t seq_len = y_desc->shape()[y_desc->ndim() - 2];
ptrdiff_t stride_i = y_desc->strides()[y_desc->ndim() - 2];
size_t total_seq_len = y_desc->shape()[y_desc->ndim() - 1];
ptrdiff_t stride_j = y_desc->strides()[y_desc->ndim() - 1];
if (y_desc->ndim() == 3) {
stride_b = y_desc->strides()[0];
batch_size = y_desc->shape()[0];
}
info->batch_size = batch_size;
info->stride_b = stride_b;
info->seq_len = seq_len;
info->stride_i = stride_i;
info->total_seq_len = total_seq_len;
info->stride_j = stride_j;
return INFINI_STATUS_SUCCESS;
}
#define DESCRIPTOR(NAMESPACE) \ #define DESCRIPTOR(NAMESPACE) \
\
namespace op::causal_softmax::NAMESPACE { \ namespace op::causal_softmax::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \ class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \ struct Opaque; \
...@@ -65,20 +18,26 @@ inline infiniStatus_t createCausalSoftmaxInfo(CausalSoftmaxInfo *info, infiniopT ...@@ -65,20 +18,26 @@ inline infiniStatus_t createCausalSoftmaxInfo(CausalSoftmaxInfo *info, infiniopT
CausalSoftmaxInfo info, \ CausalSoftmaxInfo info, \
size_t workspace_size, \ size_t workspace_size, \
infiniDevice_t device_type, \ infiniDevice_t device_type, \
int device_id) : InfiniopDescriptor{device_type, device_id}, \ int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \ _opaque(opaque), \
_info(info), \ _info(info), \
_workspace_size(workspace_size) {} \ _workspace_size(workspace_size) {} \
\ \
public: \ public: \
~Descriptor(); \ ~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \ size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \ static infiniStatus_t create( \
infiniopHandle_t handle, \ infiniopHandle_t handle, \
Descriptor **desc_ptr, \ Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc); \ infiniopTensorDescriptor_t y_desc); \
infiniStatus_t calculate(void *workspace, size_t workspace_size, \ \
void *data, void *stream); \ infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *data, \
void *stream) const; \
}; \ }; \
} }
......
...@@ -3,15 +3,16 @@ ...@@ -3,15 +3,16 @@
#include "../../../reduce/cpu/reduce.h" #include "../../../reduce/cpu/reduce.h"
namespace op::causal_softmax::cpu { namespace op::causal_softmax::cpu {
Descriptor::~Descriptor() {} Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create( infiniStatus_t Descriptor::create(
infiniopHandle_t handle, infiniopHandle_t handle,
Descriptor **desc_ptr, Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc) { infiniopTensorDescriptor_t y_desc) {
CausalSoftmaxInfo info; auto result = CausalSoftmaxInfo::create(y_desc);
CHECK_STATUS(createCausalSoftmaxInfo(&info, y_desc)); CHECK_RESULT(result);
*desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id); *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
...@@ -53,9 +54,11 @@ infiniStatus_t causal_softmax(const CausalSoftmaxInfo *info, T *data) { ...@@ -53,9 +54,11 @@ infiniStatus_t causal_softmax(const CausalSoftmaxInfo *info, T *data) {
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *data, void *data,
void *stream) { void *stream) const {
if (_info.dtype == INFINI_DTYPE_F16) { if (_info.dtype == INFINI_DTYPE_F16) {
CHECK_STATUS(causal_softmax<fp16_t>(&_info, (fp16_t *)data)); CHECK_STATUS(causal_softmax<fp16_t>(&_info, (fp16_t *)data));
} else if (_info.dtype == INFINI_DTYPE_F32) { } else if (_info.dtype == INFINI_DTYPE_F32) {
......
#ifndef __CAUSAL_SOFTMAX_INFO_H__
#define __CAUSAL_SOFTMAX_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::causal_softmax {
// Validated shape/stride description of a causal-softmax operand.
class CausalSoftmaxInfo {
    CausalSoftmaxInfo() = default;

public:
    infiniDtype_t dtype;
    size_t batch_size;
    ptrdiff_t stride_b;
    size_t seq_len;
    ptrdiff_t stride_i;
    size_t total_seq_len;
    ptrdiff_t stride_j;

    // Validate y_desc (F16/F32, 2-D or 3-D, total_seq_len >= seq_len) and
    // build the info; returns a bad-status Result on any violation.
    static utils::Result<CausalSoftmaxInfo> create(infiniopTensorDescriptor_t y_desc) {
        const auto dt = y_desc->dtype();
        if (dt != INFINI_DTYPE_F16 && dt != INFINI_DTYPE_F32) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        const auto ndim = y_desc->ndim();
        if (ndim != 2 && ndim != 3) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        const auto shape = y_desc->shape();
        const auto strides = y_desc->strides();
        // The attended (last) axis must be at least as long as the query axis.
        if (shape[ndim - 1] < shape[ndim - 2]) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        CausalSoftmaxInfo info;
        info.dtype = dt;
        info.seq_len = shape[ndim - 2];
        info.stride_i = strides[ndim - 2];
        info.total_seq_len = shape[ndim - 1];
        info.stride_j = strides[ndim - 1];
        if (ndim == 3) {
            info.batch_size = shape[0];
            info.stride_b = strides[0];
        } else {
            // 2-D input: a single implicit batch with no batch stride.
            info.batch_size = 1;
            info.stride_b = 0;
        }
        return utils::Result<CausalSoftmaxInfo>(info);
    }
};
} // namespace op::causal_softmax
#endif // __CAUSAL_SOFTMAX_INFO_H__
...@@ -38,14 +38,12 @@ infiniStatus_t Descriptor::create( ...@@ -38,14 +38,12 @@ infiniStatus_t Descriptor::create(
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
infiniStatus_t status; auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::ROW_MAJOR);
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR); CHECK_RESULT(result);
if (status != INFINI_STATUS_SUCCESS) { auto info = result.take();
return status;
}
auto c = new aclnnTensorDescriptor(toAclDataType(c_desc->dtype()), auto c = new aclnnTensorDescriptor(toAclDataType(c_desc->dtype()),
{static_cast<int64_t>(info.c_matrix.rows), static_cast<int64_t>(info.c_matrix.cols)}, {static_cast<int64_t>(info.m), static_cast<int64_t>(info.n)},
{info.c_matrix.row_stride, info.c_matrix.col_stride}); {info.c_matrix.row_stride, info.c_matrix.col_stride});
auto a = new aclnnTensorDescriptor(toAclDataType(a_desc->dtype()), auto a = new aclnnTensorDescriptor(toAclDataType(a_desc->dtype()),
{static_cast<int64_t>(info.a_matrix.rows), static_cast<int64_t>(info.a_matrix.cols)}, {static_cast<int64_t>(info.a_matrix.rows), static_cast<int64_t>(info.a_matrix.cols)},
......
...@@ -71,11 +71,9 @@ infiniStatus_t Descriptor::create( ...@@ -71,11 +71,9 @@ infiniStatus_t Descriptor::create(
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
infiniStatus_t status; auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::ROW_MAJOR);
auto info = MatmulInfo(c_desc, a_desc, b_desc, &status, MatrixLayout::ROW_MAJOR); CHECK_RESULT(result);
if (status != INFINI_STATUS_SUCCESS) { auto info = result.take();
return status;
}
cnnlTensorDescriptor_t a, b, c; cnnlTensorDescriptor_t a, b, c;
CHECK_BANG(cnnlCreateTensorDescriptor(&a)); CHECK_BANG(cnnlCreateTensorDescriptor(&a));
......
#ifndef __BLAS_H__
#define __BLAS_H__
#include "../../operator.h"
#include "../../tensor.h"
#include <algorithm>
namespace op::gemm {
// Describes one operand of a (possibly batched) GEMM as a 2-D matrix view.
struct BlasMatrix {
    size_t ndim;
    size_t batch;
    ptrdiff_t stride;
    size_t rows;
    size_t cols;
    ptrdiff_t row_stride;
    ptrdiff_t col_stride;

    BlasMatrix() = default;

    // Build the view from a 2-D ([rows, cols]) or 3-D ([batch, rows, cols])
    // tensor descriptor; any failure is reported through *status.
    BlasMatrix(infiniopTensorDescriptor_t layout, infiniStatus_t *status) {
        switch (layout->ndim()) {
        case 2:
            ndim = 2;
            batch = 1;
            stride = 0;
            rows = layout->dim(0);
            cols = layout->dim(1);
            row_stride = layout->stride(0);
            col_stride = layout->stride(1);
            break;
        case 3:
            ndim = 3;
            batch = layout->dim(0);
            stride = (batch == 1) ? 0 : layout->stride(0);
            rows = layout->dim(1);
            cols = layout->dim(2);
            row_stride = layout->stride(1);
            col_stride = layout->stride(2);
            break;
        default:
            *status = INFINI_STATUS_BAD_TENSOR_SHAPE;
            return;
        }
        // BLAS routines require one of the two axes to be contiguous.
        if (row_stride != 1 && col_stride != 1) {
            *status = INFINI_STATUS_BAD_TENSOR_STRIDES;
            return;
        }
        *status = INFINI_STATUS_SUCCESS;
    }

    // A batch of 1 broadcasts against any requested batch size.
    bool match_batch(size_t _batch) const {
        return batch == _batch || batch == 1;
    }

    // Swap the roles of rows and columns in place.
    void transpose() {
        std::swap(rows, cols);
        std::swap(row_stride, col_stride);
    }

    // Leading dimension: the stride of the non-contiguous axis.
    ptrdiff_t ld() const {
        if (row_stride == 1) {
            return col_stride;
        }
        return row_stride;
    }
};
// Memory order the backend BLAS routine expects for the output matrix.
enum class MatrixLayout : char {
    COL_MAJOR,
    ROW_MAJOR,
};
// Validated description of a (batched) matrix multiply C[m,n] = A[m,k] * B[k,n].
struct MatmulInfo {
    BlasMatrix a_matrix;
    BlasMatrix b_matrix;
    BlasMatrix c_matrix;

    // Problem sizes derived from the (possibly transposed) matrix views.
    size_t m, n, k, batch;
    // True when all three operands were transposed and A/B swapped (see below).
    bool is_transed = false;

    // Build and validate the GEMM description; any failure is reported
    // through *status and leaves the remaining fields unset.
    MatmulInfo(infiniopTensorDescriptor_t c_desc,
               infiniopTensorDescriptor_t a_desc,
               infiniopTensorDescriptor_t b_desc,
               infiniStatus_t *status,
               MatrixLayout layout) {
        a_matrix = BlasMatrix(a_desc, status);
        if (*status != INFINI_STATUS_SUCCESS) {
            return;
        }
        b_matrix = BlasMatrix(b_desc, status);
        if (*status != INFINI_STATUS_SUCCESS) {
            return;
        }
        c_matrix = BlasMatrix(c_desc, status);
        if (*status != INFINI_STATUS_SUCCESS) {
            return;
        }

        // Dimension compatibility: C is (a.rows x b.cols) and a.cols == b.rows.
        if (c_matrix.rows != a_matrix.rows || c_matrix.cols != b_matrix.cols || a_matrix.cols != b_matrix.rows) {
            *status = INFINI_STATUS_BAD_TENSOR_SHAPE;
            return;
        }

        // A or B may have batch 1 (broadcast); otherwise batches must match C's.
        batch = c_matrix.batch;
        if (!a_matrix.match_batch(batch) || !b_matrix.match_batch(batch)) {
            *status = INFINI_STATUS_BAD_TENSOR_SHAPE;
            return;
        }

        // If C's actual layout disagrees with the requested one, rewrite the
        // product using (A*B)^T = B^T * A^T: transpose all three views and
        // swap A with B so that C ends up in the requested layout.
        if ((layout == MatrixLayout::COL_MAJOR && c_matrix.col_stride == 1)
            || (layout == MatrixLayout::ROW_MAJOR && c_matrix.row_stride == 1)) {
            c_matrix.transpose();
            b_matrix.transpose();
            a_matrix.transpose();
            std::swap(a_matrix, b_matrix);
            is_transed = true;
        }

        // Sizes are taken after any transposition above.
        m = c_matrix.rows;
        n = c_matrix.cols;
        k = a_matrix.cols;
    }
};
} // namespace op::gemm
#endif // __BLAS_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment