jerrrrry / infinicore · Commit 3c31dc6c (unverified)

Authored Feb 18, 2025 by PanZezhong1725; committed by GitHub on Feb 18, 2025.

Merge pull request #45 from YdrMaster/main

issue/52 Code formatting: mechanism and effect

Parents: 16dad776, e5ed9fa1

Changes: 48 files in total; this page shows 20 changed files with 542 additions and 297 deletions.
Files changed on this page:

.clang-format                                     +17  -53
.github/workflows/build.yml                       +37  -0
README.md                                         +80  -31
include/infinicore.h                              +1   -2
include/infiniop/ops/causal_softmax.h             +0   -1
include/infiniop/ops/conv.h                       +0   -1
include/infiniop/ops/random_sample.h              +0   -1
include/infiniop/tensor_descriptor.h              +1   -1
scripts/format.py                                 +204 -0
src/infiniop/devices/ascend/common_ascend.cc      +88  -88
src/infiniop/devices/ascend/common_ascend.h       +0   -1
src/infiniop/devices/ascend/tensor_aclnn.cc       +4   -3
src/infiniop/devices/cpu/common_cpu.cc            +4   -5
src/infiniop/devices/cpu/common_cpu.h             +2   -2
src/infiniop/devices/cuda/common_cuda.cuh         +22  -22
src/infiniop/devices/pool.h                       +3   -3
src/infiniop/ops/causal_softmax/operator.cc       +71  -71
src/infiniop/ops/matmul/ascend/matmul_aclnn.cc    +4   -8
src/infiniop/ops/matmul/bang/matmul_cnnl.cc       +2   -2
src/infiniop/ops/matmul/blas.h                    +2   -2
.clang-format

The CLion-generated configuration is replaced by a shorter, commented LLVM-based style:

-# Generated from CLion C/C++ Code Style settings
----
 BasedOnStyle: LLVM
-AccessModifierOffset: -4
-AlignAfterOpenBracket: Align
-# AlignConsecutiveAssignments: None
-AlignOperands: Align
-AllowAllArgumentsOnNextLine: false
-AllowAllConstructorInitializersOnNextLine: false
-AllowAllParametersOfDeclarationOnNextLine: false
-AllowShortBlocksOnASingleLine: Always
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: Always
-AllowShortLambdasOnASingleLine: All
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterReturnType: None
-AlwaysBreakTemplateDeclarations: No
-BreakBeforeBraces: Custom
+IndentWidth: 4 # indent width; the LLVM default is 2, changed to 4
+AccessModifierOffset: -4 # offset of public/protected/private access specifiers relative to members, paired with IndentWidth; the LLVM default is -2
+AlignOperands: AlignAfterOperator # alignment of binary operands across wrapped lines; the LLVM default is Align, changed so the operator wraps together with its operand
+BreakBeforeBinaryOperators: All # break before binary operators; the LLVM default is None, changed so a wrapped binary operator, including assignment (=), always starts the line
+ColumnLimit: 0 # column limit; the LLVM default is 80, changed to unlimited
+AllowShortBlocksOnASingleLine: Always # whether a short block (a single-statement block) may stay on one line; the LLVM default is Never, changed to allow it
+AllowShortLoopsOnASingleLine: true # whether a short loop may stay on one line; the LLVM default is false, changed to allow it
+InsertBraces: true # whether to insert braces after if/for/while/switch and similar statements; the LLVM default is false, changed to enable it
+BreakBeforeBraces: Custom # brace-wrapping configuration; the LLVM default is LLVM, changed to Custom so that BraceWrapping takes effect
 BraceWrapping:
   AfterCaseLabel: false
   AfterClass: false
...
@@ -23,44 +16,15 @@ BraceWrapping:
   AfterEnum: false
   AfterFunction: false
   AfterNamespace: false
-  AfterObjCDeclaration: false
-  AfterStruct: false
   AfterUnion: false
-  AfterExternBlock: false
   BeforeCatch: false
   BeforeElse: false
-  BeforeLambdaBody: false
-  BeforeWhile: false
   IndentBraces: false
-  SplitEmptyFunction: false
+  SplitEmptyFunction: true
   SplitEmptyRecord: true
-  SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: None
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializers: BeforeColon
-BreakInheritanceList: BeforeColon
-ColumnLimit: 0
-CompactNamespaces: true
-ContinuationIndentWidth: 4
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth: 4
-KeepEmptyLinesAtTheStartOfBlocks: true
-MaxEmptyLinesToKeep: 2
-NamespaceIndentation: All
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-PointerAlignment: Right
-ReflowComments: false
-SpaceAfterCStyleCast: true
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 0
-SpacesInAngles: false
-SpacesInCStyleCastParentheses: false
-SpacesInContainerLiterals: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-TabWidth: 4
-UseTab: Never
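This configuration is what scripts/format.py (added later in this commit) applies through clang-format's `-style=file` mode. As a quick illustration — this snippet is not part of the commit, and it assumes `clang-format-16` is available on PATH; the file path is only an example taken from this page — a single file can be checked against the configuration the same way the script does, by running a dry-run pass and inspecting stderr:

```python
# Illustrative only (assumes clang-format-16 is installed); mirrors the
# -style=file / -dry-run invocation used by scripts/format.py. clang-format
# reports would-be edits on stderr, so an empty stderr means the file
# already matches the .clang-format above.
import subprocess

proc = subprocess.run(
    ["clang-format-16", "-style=file", "-dry-run", "-i", "src/infiniop/devices/pool.h"],
    capture_output=True,
    text=True,
)
print("already formatted" if not proc.stderr else proc.stderr)
```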
.github/workflows/build.yml (new file, mode 100644)

name: Build and test

on:
  pull_request:
  push:
    paths-ignore:
      - '**.md'
      - 'LICENSE'

jobs:
  build:
    name: Build
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        type: [debug, release]

    steps:
      - name: checkout code
        uses: actions/checkout@v4

      - name: install black
        run: pip install black

      - name: check format
        run: python3 scripts/format.py --path src --check

      - name: install xmake
        uses: xmake-io/github-action-setup-xmake@v1
        with:
          xmake-version: latest

      - name: configure xmake
        run: xmake f -cv

      - name: build with xmake
        run: xmake build && xmake install
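The "check format" step is the gate this commit introduces: the job fails whenever a file under src does not match the .clang-format/black style. The sketch below is not part of the commit; it simply shells out to the same script the workflow calls and assumes the formatters it needs (clang-format-16 and black) are installed locally:

```python
# Hypothetical local equivalent of the CI "check format" step above.
# Exits non-zero when some file under src/ still needs reformatting.
import subprocess
import sys

result = subprocess.run([sys.executable, "scripts/format.py", "--path", "src", "--check"])
sys.exit(result.returncode)
```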
README.md

The introduction and usage instructions are rewritten and a development guide for the new formatting workflow is added. The updated README reads:

# InfiniCore

InfiniCore is a cross-platform unified programming toolkit that provides a single C-language interface to the capabilities of different chip platforms, including compute, runtime, and communication. The currently supported hardware and backends are:

- CPU;
- CUDA
- NVIDIA GPU (英伟达);
- Moore Threads GPU (摩尔线程);
- Iluvatar GPU (天数智芯);
- MetaX GPU (沐曦);
- Sugon DCU (曙光);
- Huawei Ascend NPU (华为昇腾);
- Cambricon MLU (寒武纪);
- Kunlunxin XPU (昆仑芯);

## Configuration and usage

### 1. Configuration

#### View the current configuration

```shell
xmake f -v
```

#### Configure for CPU (default configuration)

```shell
xmake f -cv
```

#### Configure for accelerator cards

```shell
# NVIDIA
# A CUDA path can be specified; it usually lives in the `CUDA_HOME` or `CUDA_ROOT` environment variable
xmake f --nv-gpu=true --cuda=$CUDA_HOME -cv

# Cambricon
xmake f --cambricon-mlu=true -cv

# Huawei Ascend
xmake f --ascend-npu=true -cv
```

### 2. Build and install

The default installation path is `$HOME/.infini`.

```shell
xmake build && xmake install
```

### 3. Set environment variables

Set the `INFINI_ROOT` and `LD_LIBRARY_PATH` environment variables as prompted by the install output.

### 4. Run operator tests

```shell
python test/infiniop/[operator].py [--cpu | --nvidia | --cambricon | --ascend]
```

## Development guide

### Code formatting

This project uses the [`scripts/format.py`](/scripts/format.py) script to check and apply code formatting.

Run

```shell
python scripts/format.py -h
```

to show the script's help:

```plaintext
usage: format.py [-h] [--ref REF] [--path [PATH ...]] [--check] [--c C] [--py PY]

options:
  -h, --help         show this help message and exit
  --ref REF          Git reference (commit hash) to compare against.
  --path [PATH ...]  Files to format or check.
  --check            Check files without modifying them.
  --c C              C formatter (default: clang-format-16)
  --py PY            Python formatter (default: black)
```

Among the options (examples of the three selection modes follow this list):

- `ref` and `path` control which files are formatted:
  - if both `ref` and `path` are empty, the currently staged (git added) files are formatted;
  - otherwise:
    - if `ref` is non-empty, the given commit is diffed against the current code and only the modified files are formatted;
    - if `path` is non-empty, several paths may be passed (`--path p0 p1 p2`) and only files under those paths and their subdirectories are formatted;
- if `--check` is set, the script only checks whether the code needs reformatting and does not modify any file;
- `--c` selects the C/C++ formatter, which defaults to `clang-format-16`;
- `--py` selects the Python formatter, `black`.
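The following invocations are hypothetical examples of those three modes; the ref and the paths are placeholders, not commands from this commit:

```python
# Example invocations of scripts/format.py (values are placeholders).
import subprocess
import sys

py = sys.executable
# 1. No --ref / --path: check only the files currently staged with `git add`.
subprocess.run([py, "scripts/format.py", "--check"])
# 2. --ref: format every supported file changed since the given commit.
subprocess.run([py, "scripts/format.py", "--ref", "16dad776"])
# 3. --path with several directories: format those trees in place.
subprocess.run([py, "scripts/format.py", "--path", "src", "include", "scripts"])
```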
include/infinicore.h

@@ -6,8 +6,7 @@
 #define __INFINICORE_EXPORT_C__
 #if defined(_WIN32)
 #define __export __declspec(dllexport)
-#elif defined(__GNUC__) && \
-    ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#elif defined(__GNUC__) && ((__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
 #define __export __attribute__((visibility("default")))
 #else
 #define __export
 ...
include/infiniop/ops/causal_softmax.h

@@ -19,5 +19,4 @@ __C __export infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescrip
 __C __export infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc);

-
 #endif
include/infiniop/ops/conv.h

@@ -21,5 +21,4 @@ __C __export infiniopStatus_t infiniopConv(infiniopConvDescriptor_t desc, void *
 __C __export infiniopStatus_t infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc);

-
 #endif
include/infiniop/ops/random_sample.h

@@ -22,5 +22,4 @@ __C __export infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescripto
 __C __export infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc);

-
 #endif
include/infiniop/tensor_descriptor.h

@@ -21,4 +21,4 @@ __C __export infiniopStatus_t infiniopCreateTensorDescriptor(infiniopTensorDescr
 __C __export infiniopStatus_t infiniopDestroyTensorDescriptor(infiniopTensorDescriptor_t desc);
-#endif// __INFINIOP_TENSOR_DESCRIPTOR__
+#endif // __INFINIOP_TENSOR_DESCRIPTOR__
scripts/format.py (new file, mode 100644)

import argparse
import subprocess
import os
from pathlib import Path
from colorama import Fore, Style

# Supported file types
SUPPORTED_FILES = {
    ".h": "c",
    ".hh": "c",
    ".hpp": "c",
    ".c": "c",
    ".cc": "c",
    ".cpp": "c",
    ".cxx": "c",
    ".cu": "c",
    ".cuh": "c",
    ".mlu": "c",
    ".cl": "c",
    ".py": "py",
}


def format_file(file: Path, check: bool, formatter) -> bool:
    formatter = formatter.get(SUPPORTED_FILES.get(file.suffix, None), None)
    if not formatter:
        return True  # Unsupported file type, skip it
    try:
        cmd = []
        if formatter.startswith("clang-format"):
            cmd = [formatter, "-style=file", "-i", file]
            if check:
                cmd.insert(2, "-dry-run")
                process = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    check=True,
                )
                if process.stderr:
                    print(f"{Fore.YELLOW}{file} is not formatted.{Style.RESET_ALL}")
                    print(
                        f"Use {Fore.CYAN}{formatter} -style=file -i {file}{Style.RESET_ALL} to format it."
                    )
                    return False
            else:
                subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    check=True,
                )
                print(f"{Fore.CYAN}Formatted: {file}{Style.RESET_ALL}")
        elif formatter == "black":
            cmd = [formatter, file]
            if check:
                cmd.insert(1, "--check")
                process = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    check=True,
                )
                if process.stderr:
                    print(f"{Fore.YELLOW}{file} is not formatted.{Style.RESET_ALL}")
                    print(
                        f"Use {Fore.CYAN}{formatter} {file}{Style.RESET_ALL} to format it."
                    )
                    return False
            else:
                subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    check=True,
                )
                print(f"{Fore.CYAN}Formatted: {file}{Style.RESET_ALL}")
    except FileNotFoundError:
        print(f"{Fore.RED}Formatter {formatter} not found, {file} skipped.{Style.RESET_ALL}")
    except subprocess.CalledProcessError as e:
        print(f"{Fore.RED}Formatter {formatter} failed: {e}{Style.RESET_ALL}")
    return True


def git_added_files():
    """Collect all files with staged changes."""
    try:
        # Use `git diff --cached --name-only` to list every file added to the staging area
        result = subprocess.run(
            ["git", "diff", "--cached", "--name-only"],
            capture_output=True,
            text=True,
            check=True,
        )
        for file in result.stdout.splitlines():
            yield Path(file.strip())
    except subprocess.CalledProcessError as e:
        print(f"{Fore.RED}Git diff failed: {e}{Style.RESET_ALL}")


def git_modified_since_ref(ref):
    """Collect the files modified between the given Git reference and the current state."""
    try:
        result = subprocess.run(
            ["git", "diff", f"{ref}..", "--diff-filter=AMR", "--name-only"],
            capture_output=True,
            text=True,
            check=True,
        )
        for file in result.stdout.splitlines():
            yield Path(file.strip())
    except subprocess.CalledProcessError as e:
        print(f"{Fore.RED}Git diff failed: {e}{Style.RESET_ALL}")


def list_files(paths):
    """Recursively collect every file under the given paths."""
    files = []
    for path in paths:
        if path.is_file():
            yield path
        elif path.is_dir():
            for dirpath, _, filenames in os.walk(path):
                for name in filenames:
                    yield Path(dirpath) / name
        else:
            print(f"{Fore.RED}Error: {path} is not a file or directory.{Style.RESET_ALL}")


def filter_in_path(file: Path, path) -> bool:
    """Check whether a file lies under one of the given paths."""
    for p in path:
        if file.is_relative_to(p):
            return True
    return False


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--ref", type=str, help="Git reference (commit hash) to compare against.")
    parser.add_argument("--path", nargs="*", type=Path, help="Files to format or check.")
    parser.add_argument("--check", action="store_true", help="Check files without modifying them.")
    parser.add_argument("--c", default="clang-format-16", help="C formatter (default: clang-format-16)")
    parser.add_argument("--py", default="black", help="Python formatter (default: black)")
    args = parser.parse_args()

    if args.ref is None and args.path is None:
        # Last commit.
        print("{Fore.GREEN}Formating git added files.{Style.RESET_ALL}")
        files = git_added_files()
    else:
        if args.ref is None:
            print(f"{Fore.GREEN}Formating files in {args.path}.{Style.RESET_ALL}")
            files = list_files(args.path)
        elif args.path is None:
            print(f"{Fore.GREEN}Formating git modified files from {args.ref}.{Style.RESET_ALL}")
            files = git_modified_since_ref(args.ref)
        else:
            print(
                f"{Fore.GREEN}Formating git modified files from {args.ref} in {args.path}.{Style.RESET_ALL}"
            )
            files = (
                file
                for file in git_modified_since_ref(args.ref)
                if filter_in_path(file, args.path)
            )

    formatted = True
    for file in files:
        if not format_file(
            file,
            args.check,
            {
                "c": args.c,
                "py": args.py,
            },
        ):
            formatted = False
    if not formatted:
        exit(1)


if __name__ == "__main__":
    main()
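One behavioural detail of the script above: `--path` filtering goes through `Path.is_relative_to`, which exists only in Python 3.9 and later. The toy check below is not from the repository; the file names are just examples taken from this page:

```python
# Illustration (not from the repo) of the path filtering used by
# filter_in_path above: Path.is_relative_to (Python >= 3.9) decides whether
# a changed file is kept when --ref and --path are combined.
from pathlib import Path

print(Path("src/infiniop/devices/pool.h").is_relative_to("src"))  # True
print(Path("include/infinicore.h").is_relative_to("src"))         # False
```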
src/infiniop/devices/ascend/common_ascend.cc

The file is reformatted throughout (+88 -88); in the hunk below the change inserts braces around every branch of toAclDataType, matching the InsertBraces: true setting above. After formatting the hunk reads:

@@ -31,35 +31,35 @@ infiniopStatus_t freeWorkspace(void *workspaceAddr) {
}

aclDataType toAclDataType(infiniDtype_t dt) {
    if (dt == INFINI_DTYPE_I8) {
        return aclDataType::ACL_INT8;
    } else if (dt == INFINI_DTYPE_I16) {
        return aclDataType::ACL_INT16;
    } else if (dt == INFINI_DTYPE_I32) {
        return aclDataType::ACL_INT32;
    } else if (dt == INFINI_DTYPE_I64) {
        return aclDataType::ACL_INT64;
    } else if (dt == INFINI_DTYPE_U8) {
        return aclDataType::ACL_UINT8;
    } else if (dt == INFINI_DTYPE_U16) {
        return aclDataType::ACL_UINT16;
    } else if (dt == INFINI_DTYPE_U32) {
        return aclDataType::ACL_UINT32;
    } else if (dt == INFINI_DTYPE_U64) {
        return aclDataType::ACL_UINT64;
    } else if (dt == INFINI_DTYPE_F16) {
        return aclDataType::ACL_FLOAT16;
    } else if (dt == INFINI_DTYPE_BF16) {
        return aclDataType::ACL_BF16;
    } else if (dt == INFINI_DTYPE_F32) {
        return aclDataType::ACL_FLOAT;
    } else if (dt == INFINI_DTYPE_F64) {
        return aclDataType::ACL_DOUBLE;
    } else {
        return aclDataType::ACL_DT_UNDEFINED;
    }
}

const char *dataTypeToString(aclDataType dtype) {
    switch (dtype) {
    case ACL_DT_UNDEFINED:
    ...
src/infiniop/devices/ascend/common_ascend.h

@@ -34,7 +34,6 @@ extern "C" {
         return INFINIOP_STATUS_INTERNAL_ERROR; \
     } while (0)

-
 #ifdef __cplusplus
 };
 #endif
 ...
src/infiniop/devices/ascend/tensor_aclnn.cc

@@ -21,7 +21,6 @@ infiniopStatus_t aclnnTensorDescriptor::setDescriptor(aclDataType dtype, const s
     return INFINIOP_STATUS_SUCCESS;
 }

-
 /// @brief Infer storage shape. For now this ruturns a 1D shape of the total tensor storage size.
 /// We don't see why higher dimensional storage shape is ever needed. To change if necesary.
 infiniopStatus_t aclnnTensorDescriptor::inferStorageShape() {
 ...

@@ -93,8 +92,10 @@ char *aclnnTensorDescriptor::toString() {
     // Assume bufferSize
     size_t bufferSize = 1024 + this->ndim * 40 + this->storageNdim * 40;
     char *buffer = (char *)malloc(bufferSize);
-    if (!buffer) return NULL;
+    if (!buffer) {
+        return NULL;
+    }
     // Write info into buffer
     char *ptr = buffer;
 ...
src/infiniop/devices/cpu/common_cpu.cc

@@ -37,8 +37,7 @@ uint16_t f32_to_f16(float val) {
     uint32_t f32;
     memcpy(&f32, &val, sizeof(f32)); // Read the bits of the float32
     uint16_t sign = (f32 >> 16) & 0x8000; // Extract the sign bit
-    int32_t exponent =
-        ((f32 >> 23) & 0xFF) - 127;// Extract and de-bias the exponent
+    int32_t exponent = ((f32 >> 23) & 0xFF) - 127; // Extract and de-bias the exponent
     uint32_t mantissa = f32 & 0x7FFFFF; // Extract the mantissa (fraction part)
     if (exponent >= 31) { // Special cases for Inf and NaN
 ...
src/infiniop/devices/cpu/common_cpu.h

@@ -27,4 +27,4 @@ size_t getPaddedSize(size_t ndim, size_t *shape, size_t const *pads);
 // calculate the padded shape and store the result in padded_shape
 std::vector<size_t> getPaddedShape(size_t ndim, size_t const *shape, size_t const *pads);
-#endif// __INFINIOP__COMMON_CPU_H__
+#endif // __INFINIOP__COMMON_CPU_H__
src/infiniop/devices/cuda/common_cuda.cuh

The file is reformatted (+22 -22); after formatting the two hunks read:

@@ -47,18 +47,18 @@ struct InfiniopCudaHandle {
    int compute_capability_minor;
};

template <typename T>
void use_cublas(std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool, int device_id, cudaStream_t stream, T const &f) {
    auto handle = cublas_handle_pool->pop();
    if (!handle) {
        cublasCreate(&(*handle));
    }
    cublasSetStream(*handle, (cudaStream_t)stream);
    f(*handle);
    cublas_handle_pool->push(std::move(*handle));
}

template <typename T>
cudnnStatus_t use_cudnn(std::shared_ptr<Pool<cudnnHandle_t>> cudnn_handle_pool, int device_id, cudaStream_t stream, T const &f) {
    auto handle = cudnn_handle_pool->pop();
    if (!handle) {
    ...

@@ -118,4 +118,4 @@ inline __device__ __host__ size_t indexToOffset(size_t flat_index, size_t ndim,
     return res;
 }
-#endif// __INFINIOP_COMMON_CUDA_H__
+#endif // __INFINIOP_COMMON_CUDA_H__
src/infiniop/devices/pool.h

@@ -5,7 +5,7 @@
 #include <mutex>
 #include <optional>

 template <class T>
 class Pool {
 public:
     Pool() : _head(nullptr) {}
 ...

@@ -21,7 +21,7 @@ public:
     void push(T &&val) const {
         Node<T> *new_node = new Node<T>(std::move(val));
         new_node->next = _head.load();
-        while (!_head.compare_exchange_weak(new_node->next, new_node));
+        while (!_head.compare_exchange_weak(new_node->next, new_node)) {}
     }

     std::optional<T> pop() const {
 ...

@@ -37,7 +37,7 @@ public:
     }

 private:
     template <class U>
     struct Node {
         U data;
         Node<U> *next;
 ...
src/infiniop/ops/causal_softmax/operator.cc

The file is reformatted throughout (+71 -71); the four hunks are shown once each, as they read after formatting:

@@ -7,33 +7,33 @@ __C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
    switch (handle->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuCreateCausalSoftmaxDescriptor(handle, (CausalSoftmaxCpuDescriptor_t *)desc_ptr, y_desc);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t)handle, (CausalSoftmaxCudaDescriptor_t *)desc_ptr, y_desc);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangCreateCausalSoftmaxDescriptor((BangHandle_t)handle, (CausalSoftmaxBangDescriptor_t *)desc_ptr, y_desc);
        // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnCreateCausalSoftmaxDescriptor((AscendHandle_t)handle, (CausalSoftmaxAclnnDescriptor_t *)desc_ptr, y_desc);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaCreateCausalSoftmaxDescriptor((MacaHandle_t)handle, (CausalSoftmaxMacaDescriptor_t *)desc_ptr, y_desc);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc);
    }
#endif
    }
 ...

@@ -44,34 +44,34 @@ __C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmax
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t)desc, size);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCudaDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t)desc, size);
        // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnGetCausalSoftmaxWorkspaceSize((CausalSoftmaxAclnnDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t)desc, size);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size);
    }
#endif
    }
 ...

@@ -82,33 +82,33 @@ __C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t des
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t)desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaCausalSoftmax((CausalSoftmaxCudaDescriptor_t)desc, workspace, workspace_size, data, stream);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t)desc, workspace, workspace_size, data, stream);
        // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnCausalSoftmax((CausalSoftmaxAclnnDescriptor_t)desc, workspace, workspace_size, data, stream);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t)desc, workspace, workspace_size, data, stream);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream);
    }
#endif
    }
 ...

@@ -119,33 +119,33 @@ __C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftma
    switch (desc->device) {
#ifdef ENABLE_CPU
    case DevCpu:
        return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_NV_GPU
    case DevNvGpu: {
        return cudaDestroyCausalSoftmaxDescriptor((CausalSoftmaxCudaDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
        return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t)desc);
        // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
    }
#endif
#ifdef ENABLE_ASCEND_NPU
    case DevAscendNpu: {
        return aclnnDestroyCausalSoftmaxDescriptor((CausalSoftmaxAclnnDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_METAX_GPU
    case DevMetaxGpu: {
        return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t)desc);
    }
#endif
#ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu:
        return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
 ...
src/infiniop/ops/matmul/ascend/matmul_aclnn.cc

The batched loop in aclnnMatmul is re-wrapped (+4 -8); after formatting the hunk reads:

@@ -123,17 +123,13 @@ infiniopStatus_t aclnnMatmul(MatmulAclnnDescriptor_t desc, void *workspace,
    for (size_t i = 0; i < batch; i++) {
        AclSetTensorAddr(desc->executor, 0, ta,
                         (char *)(a) + i * desc->info->a_matrix.stride * infiniSizeof(desc->dtype));
        AclSetTensorAddr(desc->executor, 1, tb,
                         (char *)(b) + i * desc->info->b_matrix.stride * infiniSizeof(desc->dtype));
        AclSetTensorAddr(desc->executor, 2, tc,
                         (char *)(c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));
        AclSetTensorAddr(desc->executor, 3, tc,
                         (char *)(c) + i * desc->info->c_matrix.stride * infiniSizeof(desc->dtype));

        ret = aclnnGemm(workspace, workspaceSize, desc->executor, stream);
        CHECK_RET(ret == ACL_SUCCESS,
                  LOG_PRINT("aclnnGemm failed. ERROR: %d\n", ret);
 ...
src/infiniop/ops/matmul/bang/matmul_cnnl.cc (diff not expanded on this page)
src/infiniop/ops/matmul/blas.h

@@ -88,7 +88,7 @@ struct MatmulInfo {
         return;
     }
-    if (c_matrix.rows != a_matrix.rows || c_matrix.cols != b_matrix.cols || a_matrix.cols != b_matrix.rows){
+    if (c_matrix.rows != a_matrix.rows || c_matrix.cols != b_matrix.cols || a_matrix.cols != b_matrix.rows) {
         *status = INFINIOP_STATUS_BAD_TENSOR_SHAPE;
         return;
     }
 ...

@@ -113,4 +113,4 @@ struct MatmulInfo {
     }
 };
-#endif// __BLAS_H__
+#endif // __BLAS_H__