"dockerfile/cuda12.9.dockerfile" did not exist on "a3c65b2a57582d1bbbfd88924e38e20f1597a051"
Unverified Commit 3c31dc6c authored by PanZezhong1725, committed by GitHub

Merge pull request #45 from YdrMaster/main

issue/52 Code formatting: mechanism and effect
parents 16dad776 e5ed9fa1
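
The changes below are mechanical formatting. In the C++ sources, casts such as (CudaHandle_t) handle lose the space after the closing parenthesis, template<typename Tdata> gains one, include lists are sorted, trailing #endif// guard comments get a space, and expressions that were wrapped across several lines are joined. In the Python tests, long call sites are reflowed and tabular test-case lists are wrapped in # fmt: off / # fmt: on guards so the formatter leaves them alone. The tool configuration itself is not part of this diff; as a rough, hypothetical sketch, a .clang-format fragment producing the C++ effects seen here might read as follows (every option is inferred from the diff, not taken from the repository):

# Hypothetical .clang-format sketch, inferred only from this diff's effects.
BasedOnStyle: LLVM
ColumnLimit: 0                  # joined long lines suggest no hard limit
SpaceAfterCStyleCast: false     # (CudaHandle_t)handle
SpaceAfterTemplateKeyword: true # template <typename Tdata>
SortIncludes: true              # include lists are reordered
SpacesBeforeTrailingComments: 1 # #endif // __GUARD__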
......@@ -38,8 +38,8 @@ cpuDestroyMatmulDescriptor(infiniopMatmulCpuDescriptor_t desc) {
template <typename Tdata>
infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
float beta, void const *a, void const *b,
float alpha) {
float beta, void const *a, void const *b,
float alpha) {
auto info = desc->info;
if (info.is_transed) {
......@@ -49,20 +49,11 @@ infiniopStatus_t cpuCalculateMatmul(infiniopMatmulCpuDescriptor_t desc, void *c,
for (size_t i = 0; i < info.batch; ++i) {
for (size_t m_ = 0; m_ < info.m; ++m_) {
for (size_t n_ = 0; n_ < info.n; ++n_) {
auto c_ = reinterpret_cast<Tdata *>(c) +
i * info.c_matrix.stride +
m_ * info.c_matrix.row_stride +
n_ * info.c_matrix.col_stride;
auto c_ = reinterpret_cast<Tdata *>(c) + i * info.c_matrix.stride + m_ * info.c_matrix.row_stride + n_ * info.c_matrix.col_stride;
float sum = 0;
for (size_t k_ = 0; k_ < info.k; ++k_) {
auto a_ = reinterpret_cast<Tdata const *>(a) +
i * info.a_matrix.stride +
m_ * info.a_matrix.row_stride +
k_ * info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) +
i * info.b_matrix.stride +
n_ * info.b_matrix.col_stride +
k_ * info.b_matrix.row_stride;
auto a_ = reinterpret_cast<Tdata const *>(a) + i * info.a_matrix.stride + m_ * info.a_matrix.row_stride + k_ * info.a_matrix.col_stride;
auto b_ = reinterpret_cast<Tdata const *>(b) + i * info.b_matrix.stride + n_ * info.b_matrix.col_stride + k_ * info.b_matrix.row_stride;
if constexpr (std::is_same<Tdata, uint16_t>::value) {
sum += f16_to_f32(*a_) * f16_to_f32(*b_);
} else {
......
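The hunk above only reflows the CPU matmul kernel, but the addressing scheme it uses is worth spelling out: every element of a, b, and c is located through a per-batch stride plus row and column strides, so one loop nest serves contiguous, transposed, and batched layouts alike. A minimal self-contained sketch of that scheme follows; the MatrixLayout struct, the gemm_f32 name, and the float-only signature are illustrative stand-ins, not the library's API, and the final line assumes the standard GEMM epilogue implied by the alpha/beta parameters.

#include <cstddef>

// Illustrative stand-in for the descriptor's per-matrix layout info.
struct MatrixLayout {
    ptrdiff_t stride;     // offset between consecutive batch matrices
    ptrdiff_t row_stride; // offset between consecutive rows
    ptrdiff_t col_stride; // offset between consecutive columns
};

// C[i] = alpha * A[i] * B[i] + beta * C[i] for each batch i, using the same
// pointer arithmetic as cpuCalculateMatmul above. Note that b is addressed
// with n_ * col_stride + k_ * row_stride: its rows are indexed by k.
void gemm_f32(float *c, float const *a, float const *b,
              size_t batch, size_t m, size_t n, size_t k,
              MatrixLayout lc, MatrixLayout la, MatrixLayout lb,
              float alpha, float beta) {
    for (size_t i = 0; i < batch; ++i)
        for (size_t m_ = 0; m_ < m; ++m_)
            for (size_t n_ = 0; n_ < n; ++n_) {
                auto c_ = c + i * lc.stride + m_ * lc.row_stride + n_ * lc.col_stride;
                float sum = 0;
                for (size_t k_ = 0; k_ < k; ++k_) {
                    sum += a[i * la.stride + m_ * la.row_stride + k_ * la.col_stride]
                         * b[i * lb.stride + n_ * lb.col_stride + k_ * lb.row_stride];
                }
                *c_ = beta * *c_ + alpha * sum; // assumed epilogue
            }
}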
#include "./matmul_cuda.cuh"
#include "../../utils.h"
#include "./matmul_cuda.cuh"
infiniopStatus_t cudaCreateMatmulDescriptor(infiniopCudaHandle_t handle,
infiniopMatmulCudaDescriptor_t *desc_ptr,
......
#ifndef __INFINIOP_MATMUL_CUDA_H__
#define __INFINIOP_MATMUL_CUDA_H__
#include "matmul_cuda_api.h"
#include "../../../devices/cuda/common_cuda.cuh"
#include <memory>
#include "../blas.h"
#include "matmul_cuda_api.h"
#include <memory>
typedef struct InfiniopMatmulCudaDescriptor {
infiniDevice_t device;
......@@ -14,4 +14,4 @@ typedef struct InfiniopMatmulCudaDescriptor {
std::shared_ptr<Pool<cublasHandle_t>> cublas_handle_pool;
} InfiniopMatmulCudaDescriptor;
#endif// __INFINIOP_MATMUL_CUDA_H__
#endif // __INFINIOP_MATMUL_CUDA_H__
......@@ -4,7 +4,6 @@
#include "../../../devices/cuda/cuda_handle.h"
#include "infiniop/operator.h"
struct InfiniopMatmulCudaDescriptor;
typedef struct InfiniopMatmulCudaDescriptor *infiniopMatmulCudaDescriptor_t;
......@@ -28,5 +27,4 @@ infiniopStatus_t cudaMatmul(infiniopMatmulCudaDescriptor_t desc,
infiniopStatus_t cudaDestroyMatmulDescriptor(infiniopMatmulCudaDescriptor_t desc);
#endif // __INFINIOP_MATMUL_CUDA_API_H__
#include "../../utils.h"
#include "./matmul_cuda.cuh"
template<typename Tdata>
template <typename Tdata>
infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c, float beta, void const *a, void const *b, float alpha, void *stream) {
auto info = desc->info;
......@@ -26,7 +26,7 @@ infiniopStatus_t cudaMatmulCublas(infiniopMatmulCudaDescriptor_t desc, void *c,
auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
use_cublas(desc->cublas_handle_pool, desc->device_id, (cudaStream_t) stream,
use_cublas(desc->cublas_handle_pool, desc->device_id, (cudaStream_t)stream,
[&](cublasHandle_t handle) { cublasGemmStridedBatchedEx(
handle,
op_a,
......
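One detail in the hunk above deserves a note: cuBLAS follows the Fortran column-major convention, and in this codebase (per the CPU kernel earlier in this diff) row_stride is the offset between consecutive rows. A matrix with row_stride == 1 therefore stores each column contiguously, i.e. it is already column-major, and can be passed through untransposed:

// Layout-to-op mapping used above: row_stride == 1 means the matrix is
// stored column-major, cuBLAS's native layout; otherwise ask cuBLAS to
// transpose it.
auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;
auto op_b = info.b_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;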
......@@ -3,36 +3,36 @@
__C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) {
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *) desc_ptr, result, probs);
case DevCpu:
return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *)desc_ptr, result, probs);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaCreateRandomSampleDescriptor((CudaHandle_t) handle, (RandomSampleCudaDescriptor_t *) desc_ptr, result, probs);
case DevNvGpu:
return cudaCreateRandomSampleDescriptor((CudaHandle_t)handle, (RandomSampleCudaDescriptor_t *)desc_ptr, result, probs);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateRandomSampleDescriptor((BangHandle_t) handle,
(RandomSampleBangDescriptor_t *) desc_ptr, result,
probs);
}
case DevCambriconMlu: {
return bangCreateRandomSampleDescriptor((BangHandle_t)handle,
(RandomSampleBangDescriptor_t *)desc_ptr, result,
probs);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle,
(RandomSampleAscendDescriptor_t *) desc_ptr, result, probs);
}
case DevAscendNpu: {
return ascendCreateRandomSampleDescriptor((AscendHandle_t)handle,
(RandomSampleAscendDescriptor_t *)desc_ptr, result, probs);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateRandomSampleDescriptor((MacaHandle_t) handle,
(RandomSampleMacaDescriptor_t *) desc_ptr, result,
probs);
}
case DevMetaxGpu: {
return macaCreateRandomSampleDescriptor((MacaHandle_t)handle,
(RandomSampleMacaDescriptor_t *)desc_ptr, result,
probs);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaCreateRandomSampleDescriptor((MusaHandle_t) handle, (RandomSampleMusaDescriptor_t *) desc_ptr, result, probs);
case DevMthreadsGpu:
return musaCreateRandomSampleDescriptor((MusaHandle_t)handle, (RandomSampleMusaDescriptor_t *)desc_ptr, result, probs);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -41,35 +41,35 @@ __C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handl
__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t) desc, size);
case DevCpu:
return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t)desc, size);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t) desc, size);
}
case DevNvGpu: {
return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t) desc, size);
// return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
}
case DevCambriconMlu: {
return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t)desc, size);
// return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size);
}
case DevAscendNpu: {
return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t) desc, size);
}
case DevMetaxGpu: {
return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t) desc, size);
}
case DevMthreadsGpu: {
return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t)desc, size);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -87,31 +87,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuRandomSample((RandomSampleCpuDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
case DevCpu:
return cpuRandomSample((RandomSampleCpuDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaRandomSample((RandomSampleCudaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
case DevNvGpu:
return cudaRandomSample((RandomSampleCudaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangRandomSample((RandomSampleBangDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
case DevCambriconMlu: {
return bangRandomSample((RandomSampleBangDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
case DevAscendNpu: {
return ascendRandomSample((RandomSampleAscendDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaRandomSample((RandomSampleMacaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
case DevMetaxGpu: {
return macaRandomSample((RandomSampleMacaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaRandomSample((RandomSampleMusaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
case DevMthreadsGpu:
return musaRandomSample((RandomSampleMusaDescriptor_t)desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -120,31 +120,31 @@ __C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
__C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t) desc);
case DevCpu:
return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu:
return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t) desc);
case DevNvGpu:
return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t)desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t) desc);
}
case DevCambriconMlu: {
return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc);
}
case DevAscendNpu: {
return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t)desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t) desc);
}
case DevMetaxGpu: {
return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu:
return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t) desc);
case DevMthreadsGpu:
return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t)desc);
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -7,37 +7,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
infiniopTensorDescriptor_t src) {
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *) desc_ptr, dst, src);
case DevCpu:
return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *)desc_ptr, dst, src);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaCreateRearrangeDescriptor((CudaHandle_t) handle, (RearrangeCudaDescriptor_t *) desc_ptr, dst, src);
}
case DevNvGpu: {
return cudaCreateRearrangeDescriptor((CudaHandle_t)handle, (RearrangeCudaDescriptor_t *)desc_ptr, dst, src);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateRearrangeDescriptor((BangHandle_t) handle, (RearrangeBangDescriptor_t *) desc_ptr, dst, src);
}
case DevCambriconMlu: {
return bangCreateRearrangeDescriptor((BangHandle_t)handle, (RearrangeBangDescriptor_t *)desc_ptr, dst, src);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnCreateRearrangeDescriptor((AscendHandle_t) handle,
(RearrangeAclnnDescriptor_t *) desc_ptr,
dst,
src);
}
case DevAscendNpu: {
return aclnnCreateRearrangeDescriptor((AscendHandle_t)handle,
(RearrangeAclnnDescriptor_t *)desc_ptr,
dst,
src);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateRearrangeDescriptor((MacaHandle_t) handle, (RearrangeMacaDescriptor_t *) desc_ptr, dst, src);
}
case DevMetaxGpu: {
return macaCreateRearrangeDescriptor((MacaHandle_t)handle, (RearrangeMacaDescriptor_t *)desc_ptr, dst, src);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaCreateRearrangeDescriptor((MusaHandle_t)handle, (RearrangeMusaDescriptor_t *) desc_ptr, dst, src);
}
case DevMthreadsGpu: {
return musaCreateRearrangeDescriptor((MusaHandle_t)handle, (RearrangeMusaDescriptor_t *)desc_ptr, dst, src);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -46,37 +46,37 @@ __C infiniopStatus_t infiniopCreateRearrangeDescriptor(
__C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuRearrange((RearrangeCpuDescriptor_t) desc, dst, src, stream);
case DevCpu:
return cpuRearrange((RearrangeCpuDescriptor_t)desc, dst, src, stream);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaRearrange((RearrangeCudaDescriptor_t) desc, dst, src, stream);
}
case DevNvGpu: {
return cudaRearrange((RearrangeCudaDescriptor_t)desc, dst, src, stream);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangRearrange((RearrangeBangDescriptor_t) desc, dst, src, stream);
}
case DevCambriconMlu: {
return bangRearrange((RearrangeBangDescriptor_t)desc, dst, src, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnRearrange((RearrangeAclnnDescriptor_t) desc,
dst,
src,
stream);
}
case DevAscendNpu: {
return aclnnRearrange((RearrangeAclnnDescriptor_t)desc,
dst,
src,
stream);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaRearrange((RearrangeMacaDescriptor_t) desc, dst, src, stream);
}
case DevMetaxGpu: {
return macaRearrange((RearrangeMacaDescriptor_t)desc, dst, src, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaRearrange((RearrangeMusaDescriptor_t) desc, dst, src, stream);
}
case DevMthreadsGpu: {
return musaRearrange((RearrangeMusaDescriptor_t)desc, dst, src, stream);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -85,34 +85,34 @@ __C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void
__C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t) desc);
case DevCpu:
return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t) desc);
}
case DevNvGpu: {
return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t) desc);
}
case DevCambriconMlu: {
return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc);
}
case DevAscendNpu: {
return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t)desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t) desc);
}
case DevMetaxGpu: {
return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t) desc);
}
case DevMthreadsGpu: {
return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t)desc);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -9,38 +9,38 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
float epsilon) {
switch (handle->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
case DevCpu:
return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaCreateRMSNormDescriptor((CudaHandle_t) handle, (RMSNormCudaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
case DevNvGpu: {
return cudaCreateRMSNormDescriptor((CudaHandle_t)handle, (RMSNormCudaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangCreateRMSNormDescriptor((BangHandle_t) handle, (RMSNormBangDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
case DevCambriconMlu: {
return bangCreateRMSNormDescriptor((BangHandle_t)handle, (RMSNormBangDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnCreateRMSNormDescriptor((AscendHandle_t) handle,
(RMSNormAclnnDescriptor_t *) desc_ptr,
y_desc,
x_desc,
w_desc,
epsilon);
}
case DevAscendNpu: {
return aclnnCreateRMSNormDescriptor((AscendHandle_t)handle,
(RMSNormAclnnDescriptor_t *)desc_ptr,
y_desc,
x_desc,
w_desc,
epsilon);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateRMSNormDescriptor((MacaHandle_t) handle, (RMSNormMacaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
case DevMetaxGpu: {
return macaCreateRMSNormDescriptor((MacaHandle_t)handle, (RMSNormMacaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaCreateRMSNormDescriptor((MusaHandle_t) handle, (RMSNormMusaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
case DevMthreadsGpu: {
return musaCreateRMSNormDescriptor((MusaHandle_t)handle, (RMSNormMusaDescriptor_t *)desc_ptr, y_desc, x_desc, w_desc, epsilon);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -49,35 +49,35 @@ __C infiniopStatus_t infiniopCreateRMSNormDescriptor(
__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t) desc, size);
case DevCpu:
return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t)desc, size);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t) desc, size);
}
case DevNvGpu: {
return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t) desc, size);
}
case DevCambriconMlu: {
return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc,
size);
}
case DevAscendNpu: {
return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t)desc,
size);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t) desc, size);
}
case DevMetaxGpu: {
return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t) desc, size);
}
case DevMthreadsGpu: {
return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t)desc, size);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -87,40 +87,40 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
void *y, void const *x, void const *w, void *stream) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuRMSNorm((RMSNormCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
case DevCpu:
return cpuRMSNorm((RMSNormCpuDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaRMSNorm((RMSNormCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
}
case DevNvGpu: {
return cudaRMSNorm((RMSNormCudaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangRMSNorm((RMSNormBangDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
}
case DevCambriconMlu: {
return bangRMSNorm((RMSNormBangDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnRMSNorm((RMSNormAclnnDescriptor_t) desc,
workspace,
workspace_size,
y,
x,
w,
stream);
}
case DevAscendNpu: {
return aclnnRMSNorm((RMSNormAclnnDescriptor_t)desc,
workspace,
workspace_size,
y,
x,
w,
stream);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaRMSNorm((RMSNormMacaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
}
case DevMetaxGpu: {
return macaRMSNorm((RMSNormMacaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaRMSNorm((RMSNormMusaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
}
case DevMthreadsGpu: {
return musaRMSNorm((RMSNormMusaDescriptor_t)desc, workspace, workspace_size, y, x, w, stream);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -129,34 +129,34 @@ __C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *wor
__C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) {
switch (desc->device) {
#ifdef ENABLE_CPU
case DevCpu:
return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t) desc);
case DevCpu:
return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t)desc);
#endif
#ifdef ENABLE_NV_GPU
case DevNvGpu: {
return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t) desc);
}
case DevNvGpu: {
return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: {
return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t) desc);
}
case DevCambriconMlu: {
return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t)desc);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case DevAscendNpu: {
return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t) desc);
}
case DevAscendNpu: {
return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t)desc);
}
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t) desc);
}
case DevMetaxGpu: {
return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: {
return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t) desc);
}
case DevMthreadsGpu: {
return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t)desc);
}
#endif
}
return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -13,28 +13,28 @@
#define ROUND_UP_DIV(x, y) ((x + y - 1) / y)
#define CHECK_ERROR(call, target, errCode) \
do { \
if (auto value = (call); value == (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \
return (errCode); \
} \
#define CHECK_ERROR(call, target, errCode) \
do { \
if (auto value = (call); value == (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \
return (errCode); \
} \
} while (0)
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr; \
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr; \
CHECK_ERROR(value, target, errCode)
#define CHECK_STATUS(call, target) \
do { \
if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \
return value; \
} \
#define CHECK_STATUS(call, target) \
do { \
if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \
return value; \
} \
} while (0)
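Both macros above rely on C++17's if-with-initializer and the classic do { ... } while (0) wrapper, which makes them behave as a single statement after an unbraced if or else. A hypothetical caller, assuming the surrounding header's definitions (tryInit and INFINIOP_STATUS_SUCCESS are illustrative stand-ins; only INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED actually appears in this diff):

// Hypothetical usage sketch; the names below are stand-ins.
infiniopStatus_t tryInit(infiniopHandle_t handle);

infiniopStatus_t initChecked(infiniopHandle_t handle) {
    // Logs file/function/line and propagates the unexpected status.
    CHECK_STATUS(tryInit(handle), INFINIOP_STATUS_SUCCESS);
    // Logs and returns a fixed error code when the call hits the bad value.
    CHECK_ERROR(tryInit(handle),
                INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED,
                INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED);
    return INFINIOP_STATUS_SUCCESS;
}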
inline std::vector<int64_t> getByteStrides(infiniopTensorDescriptor_t desc) {
......@@ -67,8 +67,7 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
// compute broadcasted shape
for (size_t i = 0; i < max_rank; ++i) {
if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 ||
padded_shape2[i] == 1) {
if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) {
broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
} else {
return false;
......@@ -89,10 +88,7 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
auto broadcast_shape = broadcast_shape_.data(),
padded_shape1 = padded_shape1_.data(),
padded_shape2 = padded_shape2_.data();
if (broadcast_ndim != c->ndim ||
!getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim,
broadcast_shape, padded_shape1, padded_shape2,
broadcast_ndim)) {
if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) {
return false;
}
return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim,
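The broadcasting helper in the hunk above implements the NumPy rule: shapes are right-aligned, and two sizes are compatible when they are equal or one of them is 1. One of the add-operator test cases later in this diff works through it:

// Worked example, shapes taken from the add test cases below:
//   a shape (2, 1, 3) -> right-aligned (2, 1, 3)
//   b shape    (4, 3) -> right-aligned (1, 4, 3)
//   per dimension: max(2,1)=2, max(1,4)=4, max(3,3)=3
//   => broadcast shape (2, 4, 3), matching the expected c_shape.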
......@@ -126,7 +122,6 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
}
// permute the dimensions of a tensor descriptor
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
const std::vector<size_t> &order) {
......@@ -149,10 +144,9 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are
// contiguous
inline bool isContiguous(const infiniopTensorDescriptor_t &desc,
size_t dim_start, size_t dim_end) {
size_t dim_start, size_t dim_end) {
for (size_t i = dim_start + 1; i <= dim_end; i++) {
if (desc->strides[i - 1] !=
static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
return false;
}
}
......@@ -168,7 +162,7 @@ inline bool isContiguous(const infiniopTensorDescriptor_t &desc) {
// merge the dimensions [dim_start, dim_end] of a tensor descriptor
inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
size_t dim_start, size_t dim_end) {
size_t dim_start, size_t dim_end) {
size_t ndim = desc->ndim;
if (dim_start > dim_end || dim_end >= ndim) {
return nullptr;
......@@ -203,11 +197,10 @@ inline infiniopTensorDescriptor_t dimMerge(infiniopTensorDescriptor_t desc,
// split the dimension dim of a tensor descriptor into multiple dimensions
inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
size_t dim,
const std::vector<size_t> &dims) {
size_t dim,
const std::vector<size_t> &dims) {
size_t ndim = desc->ndim;
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1,
std::multiplies{})) {
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
return nullptr;
}
size_t new_ndim = ndim + dims.size() - 1;
......@@ -221,10 +214,7 @@ inline infiniopTensorDescriptor_t dimSplit(infiniopTensorDescriptor_t desc,
}
for (size_t i = 0; i < dims.size(); i++) {
new_shape[index] = dims[i];
new_strides[index] =
desc->strides[dim] * desc->shape[dim] /
std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t)1,
std::multiplies<size_t>());
new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t)1, std::multiplies<size_t>());
index++;
}
for (size_t i = dim + 1; i < ndim; i++) {
......
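The dimSplit stride computation that the hunk above joins onto one line is easier to see with concrete numbers, e.g. splitting a contiguous dimension of size 12 (stride 1) into dims = {3, 4}:

// new_strides[index] = strides[dim] * shape[dim] / prod(dims[0..i])
//   i = 0: 1 * 12 / 3       = 4  -> outer dimension, size 3, stride 4
//   i = 1: 1 * 12 / (3 * 4) = 1  -> inner dimension, size 4, stride 1
// The split view has shape (..., 3, 4, ...) and strides (..., 4, 1, ...),
// addressing exactly the same elements as the original size-12 dimension.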
import libinfiniop
\ No newline at end of file
import libinfiniop
......@@ -41,8 +41,8 @@ def test(
lib,
handle,
torch_device,
c_shape,
a_shape,
c_shape,
a_shape,
b_shape,
tensor_dtype=torch.float16,
inplace=Inplace.OUT_OF_PLACE,
......@@ -56,13 +56,21 @@ def test(
a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device)
b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device)
c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b)
c = (
torch.rand(c_shape, dtype=tensor_dtype).to(torch_device)
if inplace == Inplace.OUT_OF_PLACE
else (a if inplace == Inplace.INPLACE_A else b)
)
ans = add(a, b)
a_tensor = to_tensor(a, lib)
b_tensor = to_tensor(b, lib)
c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
c_tensor = (
to_tensor(c, lib)
if inplace == Inplace.OUT_OF_PLACE
else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
)
descriptor = infiniopAddDescriptor_t()
check_error(
......@@ -91,8 +99,10 @@ def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
......@@ -100,8 +110,10 @@ def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
......@@ -111,13 +123,16 @@ def test_bang(lib, test_cases):
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for c_shape, a_shape, b_shape, inplace in test_cases:
# fmt: off
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace)
test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace)
# fmt: on
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# fmt: off
# c_shape, a_shape, b_shape, inplace
# ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE),
# ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE),
......@@ -133,6 +148,7 @@ if __name__ == "__main__":
((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE),
((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE),
((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE),
# fmt: on
]
args = get_args()
lib = open_lib()
......
......@@ -35,7 +35,7 @@ class AvgPoolDescriptor(Structure):
infiniopAvgPoolDescriptor_t = POINTER(AvgPoolDescriptor)
def pool(x, k, padding, stride, dilation = 1):
def pool(x, k, padding, stride, dilation=1):
pooling_layers = {
1: torch.nn.AvgPool1d,
2: torch.nn.AvgPool2d,
......@@ -48,7 +48,9 @@ def pool(x, k, padding, stride, dilation = 1):
return None
if ndim == 3 and x.dtype == torch.float16:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x.to(torch.float32)).to(torch.float16)
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(
x.to(torch.float32)
).to(torch.float16)
else:
ans = pooling_layers[ndim](k, stride=stride, padding=padding)(x)
if PROFILE:
......@@ -69,18 +71,20 @@ def inferShape(x_shape, kernel_shape, padding, strides):
return x_shape[:2] + tuple(output_shape)
# convert a python tuple to a ctype void pointer
def tuple_to_void_p(py_tuple: Tuple):
array = ctypes.c_int64 * len(py_tuple)
data_array = array(*py_tuple)
return ctypes.cast(data_array, ctypes.c_void_p)
def test(
lib,
handle,
torch_device,
x_shape,
k_shape,
x_shape,
k_shape,
padding,
strides,
tensor_dtype=torch.float16,
......@@ -90,7 +94,9 @@ def test(
)
x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device)
y = torch.rand(inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype).to(torch_device)
y = torch.rand(
inferShape(x_shape, k_shape, padding, strides), dtype=tensor_dtype
).to(torch_device)
for i in range(NUM_PRERUN if PROFILE else 1):
ans = pool(x, k_shape, padding, strides)
......@@ -126,7 +132,9 @@ def test(
check_error(
lib.infiniopGetAvgPoolWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
......@@ -164,8 +172,10 @@ def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
# fmt: off
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......@@ -173,8 +183,10 @@ def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
# fmt: off
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......@@ -184,17 +196,21 @@ def test_bang(lib, test_cases):
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, kernel_shape, padding, strides in test_cases:
# fmt: off
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, kernel_shape, padding, strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# fmt: off
# x_shape, kernel_shape, padding, strides
((1, 1, 10), (3,), (1,), (1,)),
((32, 3, 224, 224), (3, 3), (1, 1), (2, 2)),
((1, 1, 16, 16, 16), (5, 5, 5), (2, 2, 2), (2, 2, 2)),
# fmt: on
]
args = get_args()
lib = open_lib()
......
......@@ -101,6 +101,7 @@ def test_bang(lib, test_cases):
test(lib, handle, "mlu", x_shape, x_stride)
destroy_handle(lib, handle)
def test_ascend(lib, test_cases):
import torch_npu
......@@ -111,11 +112,12 @@ def test_ascend(lib, test_cases):
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# x_shape, x_stride
((32, 20, 512), None),
((32, 20, 512), (20480, 512, 1)), # Ascend does not yet support non-contiguous layouts
((32, 20, 512), (20480, 512, 1)),  # Ascend does not yet support non-contiguous layouts
]
args = get_args()
lib = open_lib()
......
......@@ -41,17 +41,11 @@ infiniopConvDescriptor_t = POINTER(ConvDescriptor)
def conv(x, w, stride, padding, dilation):
match len(x.shape) - 2:
case 1:
return F.conv1d(
x, w, stride=stride, padding=padding, dilation=dilation
)
return F.conv1d(x, w, stride=stride, padding=padding, dilation=dilation)
case 2:
return F.conv2d(
x, w, stride=stride, padding=padding, dilation=dilation
)
return F.conv2d(x, w, stride=stride, padding=padding, dilation=dilation)
case 3:
return F.conv3d(
x, w, stride=stride, padding=padding, dilation=dilation
)
return F.conv3d(x, w, stride=stride, padding=padding, dilation=dilation)
case _:
print("Error: Pytorch -> Unsupported tensor dimension")
return None
......@@ -66,11 +60,15 @@ def inferShape(
dilations: List[int],
) -> Tuple[int, ...]:
assert (
len(x_shape) == len(w_shape) == len(pads) + 2 == len(dilations) + 2 == len(strides) + 2
len(x_shape)
== len(w_shape)
== len(pads) + 2
== len(dilations) + 2
== len(strides) + 2
), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2"
output_dims = [
math.floor(
(x_shape[i+2] + 2 * pads[i] - dilations[i] * (w_shape[i+2] - 1) - 1)
(x_shape[i + 2] + 2 * pads[i] - dilations[i] * (w_shape[i + 2] - 1) - 1)
/ strides[i]
+ 1
)
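The reflowed expression above is the standard convolution output-size formula; written out for spatial dimension i:

$$\text{out}_i = \left\lfloor \frac{x_i + 2p_i - d_i\,(k_i - 1) - 1}{s_i} \right\rfloor + 1$$

where x is the input size, p the padding, d the dilation, k the kernel size, and s the stride. For example, x = 224, k = 3, p = 1, d = 1, s = 2 gives floor(223 / 2) + 1 = 112.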
......@@ -145,7 +143,9 @@ def test(
check_error(
lib.infiniopGetConvWorkspaceSize(descriptor, ctypes.byref(workspaceSize))
)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device)
workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(
torch_device
)
workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8))
for i in range(NUM_PRERUN if PROFILE else 1):
......@@ -177,7 +177,7 @@ def test(
elapsed = (time.time() - start_time) / NUM_ITERATIONS
print(f" lib time: {elapsed :6f}")
if (tensor_dtype == torch.float16):
if tensor_dtype == torch.float16:
assert torch.allclose(y, ans, atol=0, rtol=1e-2)
else:
assert torch.allclose(y, ans, atol=0, rtol=1e-3)
......@@ -188,8 +188,10 @@ def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cpu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......@@ -197,8 +199,10 @@ def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "cuda", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......@@ -208,8 +212,10 @@ def test_bang(lib, test_cases):
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for x_shape, w_shape, pads, strides, dilations, x_strides in test_cases:
# fmt: off
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float16)
test(lib, handle, "mlu", x_shape, w_shape, pads, strides, dilations, x_strides, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......
......@@ -47,10 +47,10 @@ def test(
lib,
handle,
torch_device,
y_shape,
y_shape,
x_shape,
y_stride=None,
x_stride=None,
y_stride=None,
x_stride=None,
tensor_dtype=torch.float16,
):
print(
......@@ -109,8 +109,10 @@ def test_cpu(lib, test_cases):
device = DeviceEnum.DEVICE_CPU
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cpu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......@@ -118,8 +120,10 @@ def test_cuda(lib, test_cases):
device = DeviceEnum.DEVICE_CUDA
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "cuda", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......@@ -129,13 +133,16 @@ def test_bang(lib, test_cases):
device = DeviceEnum.DEVICE_BANG
handle = create_handle(lib, device)
for y_shape, x_shape, y_stride, x_stride in test_cases:
# fmt: off
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float16)
test(lib, handle, "mlu", y_shape, x_shape, y_stride, x_stride, tensor_dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
if __name__ == "__main__":
test_cases = [
# fmt: off
# y_shape, x_shape, y_stride, x_stride
((), (), None, None),
((3, 3), (1,), None, None),
......@@ -146,6 +153,7 @@ if __name__ == "__main__":
((2, 3, 4, 5), (5,), None, None),
((3, 2, 4, 5), (3, 2, 1, 1), None, None),
((32, 256, 112, 112), (32, 256, 112, 1), None, None),
# fmt: on
]
args = get_args()
lib = open_lib()
......
......@@ -27,6 +27,7 @@ PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
class GEMMDescriptor(Structure):
_fields_ = [("device", c_int32)]
......@@ -34,10 +35,15 @@ class GEMMDescriptor(Structure):
infiniopGEMMDescriptor_t = POINTER(GEMMDescriptor)
def gemm(A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32):
def gemm(
A, B, C=None, transA=False, transB=False, alpha=1.0, beta=0.0, dtype=torch.float32
):
A = A.T if transA else A
B = B.T if transB else B
result = alpha * torch.matmul(A if dtype != torch.float16 else A.to(torch.float32), B if dtype != torch.float16 else B.to(torch.float32)).to(dtype)
result = alpha * torch.matmul(
A if dtype != torch.float16 else A.to(torch.float32),
B if dtype != torch.float16 else B.to(torch.float32),
).to(dtype)
if C is not None:
result += beta * C if dtype != torch.float16 else C.to(torch.float32)
if PROFILE:
......@@ -64,7 +70,7 @@ def test(
dtype=torch.float16,
):
print(
f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
f"Testing GEMM on {torch_device} with transA: {transA} transB: {transB} "
f"a_shape:{a_shape} b_shape:{b_shape} c_shape:{c_shape} y_shape:{y_shape} "
f"a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} y_stride:{y_stride} dtype:{dtype}"
)
......@@ -121,9 +127,7 @@ def test(
workspace_size = ctypes.c_uint64(0)
check_error(
lib.infiniopGetGEMMWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
lib.infiniopGetGEMMWorkspaceSize(descriptor, ctypes.byref(workspace_size))
)
workspace = torch.zeros(int(workspace_size.value), dtype=torch.uint8).to(
torch_device
......@@ -182,8 +186,10 @@ def test_cpu(lib, test_cases):
c_stride,
y_stride,
) in test_cases:
# fmt: off
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cpu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......@@ -204,8 +210,10 @@ def test_cuda(lib, test_cases):
c_stride,
y_stride,
) in test_cases:
# fmt: off
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "cuda", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......@@ -229,9 +237,10 @@ def test_bang(lib, test_cases):
c_stride,
y_stride,
) in test_cases:
# fmt: off
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float16)
test(lib, handle, "mlu", alpha, beta, transA, transB, a_shape, b_shape, c_shape, y_shape, a_stride, b_stride, c_stride, y_stride, dtype=torch.float32)
# fmt: on
destroy_handle(lib, handle)
......
......@@ -99,7 +99,12 @@ def test(
for i in range(NUM_PRERUN if PROFILE else 1):
check_error(
lib.infiniopGlobalAvgPool(
descriptor, workspace_ptr, workspaceSize, y_tensor.data, x_tensor.data, None
descriptor,
workspace_ptr,
workspaceSize,
y_tensor.data,
x_tensor.data,
None,
)
)
if PROFILE:
......
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '.')))
from .liboperators import open_lib, CTensor, infiniopHandle_t, infiniopTensorDescriptor_t
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".")))
from .liboperators import (
open_lib,
CTensor,
infiniopHandle_t,
infiniopTensorDescriptor_t,
)
from .devices import *
from .utils import *
from .datatypes import *
......@@ -7,7 +7,7 @@ class InfiniDtype:
I32 = 5
I64 = 6
U8 = 7
U16 = 8
U16 = 8
U32 = 9
U64 = 10
F8 = 11
......
......@@ -54,6 +54,7 @@ def create_workspace(size, torch_device):
if size == 0:
return None
import torch
return torch.zeros(size=(size,), dtype=torch.uint8, device=torch_device)
......@@ -172,6 +173,7 @@ def get_args():
def synchronize_device(torch_device):
import torch
if torch_device == "cuda":
torch.cuda.synchronize()
elif torch_device == "npu":
......@@ -197,13 +199,24 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
If True, the function will print detailed information about any discrepancies between the tensors.
"""
import numpy as np
print_discrepancy(actual, desired, atol, rtol, verbose)
np.testing.assert_allclose(actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True)
np.testing.assert_allclose(
actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True
)
def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
def debug_all(
actual_vals: Sequence,
desired_vals: Sequence,
condition: str,
atol=0,
rtol=1e-2,
equal_nan=False,
verbose=True,
):
"""
Debugging function to compare two sequences of values (actual and desired) pair by pair, results
Debugging function to compare two sequences of values (actual and desired) pair by pair, results
are linked by the given logical condition, and prints discrepancies
Arguments:
----------
......@@ -223,7 +236,10 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
- AssertionError: If the specified `condition` is not 'or' or 'and'.
"""
assert len(actual_vals) == len(desired_vals), "Invalid Length"
assert condition in {"or", "and"}, "Invalid condition: should be either 'or' or 'and'"
assert condition in {
"or",
"and",
}, "Invalid condition: should be either 'or' or 'and'"
import numpy as np
passed = False if condition == "or" else True
......@@ -237,14 +253,22 @@ def debug_all(actual_vals: Sequence, desired_vals: Sequence, condition: str, ato
elif condition == "and":
if passed and len(indices) != 0:
passed = False
print(f"\033[31mThe condition has not been satisfied: Condition #{index + 1}\033[0m")
np.testing.assert_allclose(actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True, strict=True)
print(
f"\033[31mThe condition has not been satisfied: Condition #{index + 1}\033[0m"
)
np.testing.assert_allclose(
actual.cpu(),
desired.cpu(),
rtol,
atol,
equal_nan,
verbose=True,
strict=True,
)
assert passed, "\033[31mThe condition has not been satisfied\033[0m"
def print_discrepancy(
actual, expected, atol=0, rtol=1e-3, verbose=True
):
def print_discrepancy(actual, expected, atol=0, rtol=1e-3, verbose=True):
if actual.shape != expected.shape:
raise ValueError("Tensors must have the same shape to compare.")
......@@ -273,7 +297,9 @@ def print_discrepancy(
for idx in diff_indices:
index_tuple = tuple(idx.tolist())
actual_str = f"{actual[index_tuple]:<{col_width[1]}.{decimal_places[1]}f}"
expected_str = f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}"
expected_str = (
f"{expected[index_tuple]:<{col_width[2]}.{decimal_places[2]}f}"
)
delta_str = f"{delta[index_tuple]:<{col_width[3]}.{decimal_places[3]}f}"
print(
f" > Index: {str(index_tuple):<{col_width[0]}}"
......@@ -287,10 +313,18 @@ def print_discrepancy(
print(f" - Desired dtype: {expected.dtype}")
print(f" - Atol: {atol}")
print(f" - Rtol: {rtol}")
print(f" - Mismatched elements: {len(diff_indices)} / {actual.numel()} ({len(diff_indices) / actual.numel() * 100}%)")
print(f" - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}")
print(f" - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}")
print(f" - Min(delta) : {torch.min(delta):<{col_width[1]}} | Max(delta) : {torch.max(delta):<{col_width[2]}}")
print(
f" - Mismatched elements: {len(diff_indices)} / {actual.numel()} ({len(diff_indices) / actual.numel() * 100}%)"
)
print(
f" - Min(actual) : {torch.min(actual):<{col_width[1]}} | Max(actual) : {torch.max(actual):<{col_width[2]}}"
)
print(
f" - Min(desired): {torch.min(expected):<{col_width[1]}} | Max(desired): {torch.max(expected):<{col_width[2]}}"
)
print(
f" - Min(delta) : {torch.min(delta):<{col_width[1]}} | Max(delta) : {torch.max(delta):<{col_width[2]}}"
)
print("-" * total_width + "\n")
return diff_indices
......@@ -298,14 +332,17 @@ def print_discrepancy(
def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3):
"""
Returns the atol and rtol for a given tensor data type in the tolerance_map.
Returns the atol and rtol for a given tensor data type in the tolerance_map.
If the given data type is not found, it returns the provided default tolerance values.
"""
return tolerance_map.get(tensor_dtype, {'atol': default_atol, 'rtol': default_rtol}).values()
return tolerance_map.get(
tensor_dtype, {"atol": default_atol, "rtol": default_rtol}
).values()
def timed_op(func, num_iterations, device):
import time
""" Function for timing operations with synchronization. """
synchronize_device(device)
start = time.time()
......@@ -318,7 +355,7 @@ def timed_op(func, num_iterations, device):
def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
"""
Unified profiling workflow that is used to profile the execution time of a given function.
It first performs a number of warmup runs, then performs timed execution and
It first performs a number of warmup runs, then performs timed execution and
prints the average execution time.
Arguments:
......@@ -328,11 +365,11 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS):
- torch_device (str): The device on which the operation runs, provided for timed execution.
- NUM_PRERUN (int): The number of warmup runs.
- NUM_ITERATIONS (int): The number of timed execution iterations, used to calculate the average execution time.
"""
"""
# Warmup runs
for _ in range(NUM_PRERUN):
func()
# Timed execution
elapsed = timed_op(lambda: func(), NUM_ITERATIONS, torch_device)
print(f" {desc} time: {elapsed * 1000 :6f} ms")
......@@ -347,7 +384,7 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
- lib (ctypes.CDLL): The library object containing the operator implementations.
- device (InfiniDeviceEnum): The device on which the operator should be tested. See device.py.
- test_func (function): The test function to be executed for each test case.
- test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
- test_cases (list of tuples): A list of test cases, where each test case is a tuple of parameters
to be passed to `test_func`.
- tensor_dtypes (list): A list of tensor data types (e.g., `torch.float32`) to test.
"""
......@@ -355,7 +392,13 @@ def test_operator(lib, device, test_func, test_cases, tensor_dtypes):
try:
for test_case in test_cases:
for tensor_dtype in tensor_dtypes:
test_func(lib, handle, infiniDeviceEnum_str_map[device], *test_case, tensor_dtype)
test_func(
lib,
handle,
infiniDeviceEnum_str_map[device],
*test_case,
tensor_dtype,
)
finally:
destroy_handle(lib, handle)
......@@ -365,22 +408,26 @@ def get_test_devices(args):
Using the given parsed Namespace to determine the devices to be tested.
Argument:
- args: the parsed Namespace object.
- args: the parsed Namespace object.
Return:
- devices_to_test: the devices that will be tested. Default is CPU.
"""
devices_to_test = []
if args.cpu: devices_to_test.append(InfiniDeviceEnum.CPU)
if args.nvidia: devices_to_test.append(InfiniDeviceEnum.NVIDIA)
if args.cambricon:
if args.cpu:
devices_to_test.append(InfiniDeviceEnum.CPU)
if args.nvidia:
devices_to_test.append(InfiniDeviceEnum.NVIDIA)
if args.cambricon:
import torch_mlu
devices_to_test.append(InfiniDeviceEnum.CAMBRICON)
if args.ascend:
if args.ascend:
import torch
import torch_npu
torch.npu.set_device(0) # Ascend NPU needs explicit device initialization
torch.npu.set_device(0) # Ascend NPU needs explicit device initialization
devices_to_test.append(InfiniDeviceEnum.ASCEND)
if not devices_to_test:
devices_to_test = [InfiniDeviceEnum.CPU]
......