Commit 7a833987 authored by PanZezhongQY's avatar PanZezhongQY
Browse files

feat: 添加大模型算子operator.cc,cpu和cuda视warning为错误

parent e3ea5bae
......@@ -52,14 +52,14 @@ uint16_t f32_to_f16(float val) {
// Infinity
return sign | 0x7C00;
} else if (exponent >= -14) {// Normalized case
return sign | ((exponent + 15) << 10) | (mantissa >> 13);
return (uint16_t)(sign | ((exponent + 15) << 10) | (mantissa >> 13));
} else if (exponent >= -24) {
mantissa |= 0x800000;// Add implicit leading 1
mantissa >>= (-14 - exponent);
return sign | (mantissa >> 13);
return (uint16_t)(sign | (mantissa >> 13));
} else {
// Too small for subnormal: return signed zero
return sign;
return (uint16_t)sign;
}
}
......
#include "infiniop/ops/causal_softmax.h"
// Create a causal-softmax descriptor for the backend recorded in
// handle->device. Each backend branch exists only when its ENABLE_* macro is
// defined; any other device yields DEVICE_TYPE_NOT_SUPPORTED.
__C infiniopStatus_t infiniopCreateCausalSoftmaxDescriptor(
    infiniopHandle_t handle,
    infiniopCausalSoftmaxDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateCausalSoftmaxDescriptor(handle, (CausalSoftmaxCpuDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateCausalSoftmaxDescriptor((CudaHandle_t) handle, (CausalSoftmaxCudaDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxBangDescriptor_t *) desc_ptr, y_desc);
            // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnCreateCausalSoftmaxDescriptor((AscendHandle_t) handle, (CausalSoftmaxAclnnDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateCausalSoftmaxDescriptor((MacaHandle_t) handle, (CausalSoftmaxMacaDescriptor_t *) desc_ptr, y_desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateCausalSoftmaxDescriptor((MusaHandle_t) handle, (CausalSoftmaxMusaDescriptor_t *) desc_ptr, y_desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Query the workspace size required by the causal-softmax operator, forwarding
// to the backend that created the descriptor (desc->device).
__C infiniopStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDescriptor_t desc, uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCudaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t) desc, size);
            // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnGetCausalSoftmaxWorkspaceSize((CausalSoftmaxAclnnDescriptor_t) desc, size);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t) desc, size);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Run causal softmax in place on `data`, dispatching on desc->device.
// `workspace`/`workspace_size` must satisfy the size reported by
// infiniopGetCausalSoftmaxWorkspaceSize; `stream` is backend-specific.
__C infiniopStatus_t infiniopCausalSoftmax(infiniopCausalSoftmaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *data, void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCausalSoftmax((CausalSoftmaxCpuDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCausalSoftmax((CausalSoftmaxCudaDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t) desc, workspace, workspace_size, data, stream);
            // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnCausalSoftmax((CausalSoftmaxAclnnDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t) desc, workspace, workspace_size, data, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a causal-softmax descriptor via the backend that owns it.
__C infiniopStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyCausalSoftmaxDescriptor((CausalSoftmaxCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyCausalSoftmaxDescriptor((CausalSoftmaxCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t) desc);
            // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnDestroyCausalSoftmaxDescriptor((CausalSoftmaxAclnnDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -47,8 +47,8 @@ typedef struct BlasMatrix {
*status = INFINIOP_STATUS_SUCCESS;
}
bool match_batch(size_t batch) const {
return this->batch == batch || this->batch == 1;
bool match_batch(size_t _batch) const {
return this->batch == _batch || this->batch == 1;
}
void transpose() {
......@@ -56,7 +56,7 @@ typedef struct BlasMatrix {
std::swap(row_stride, col_stride);
}
int ld() const {
int64_t ld() const {
if (this->row_stride == 1) {
return this->col_stride;
} else {
......
......@@ -31,24 +31,24 @@ infiniopStatus_t matmul_cuda(infiniopMatmulCudaDescriptor_t desc, void *c, float
handle,
op_a,
op_b,
info.m,
info.n,
info.k,
static_cast<int>(info.m),
static_cast<int>(info.n),
static_cast<int>(info.k),
&alpha,
a,
a_type,
info.a_matrix.ld(),
static_cast<int>(info.a_matrix.ld()),
info.a_matrix.stride,
b,
b_type,
info.b_matrix.ld(),
static_cast<int>(info.b_matrix.ld()),
info.b_matrix.stride,
&beta,
c,
c_type,
info.c_matrix.ld(),
static_cast<int>(info.c_matrix.ld()),
info.c_matrix.stride,
info.batch,
static_cast<int>(info.batch),
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP); });
return INFINIOP_STATUS_SUCCESS;
......
#include "../utils.h"
#include "infiniop/ops/matmul.h"
#ifdef ENABLE_CPU_API
......
#include "infiniop/ops/random_sample.h"
// Create a random-sample descriptor for the backend recorded in
// handle->device. `result` describes the output index tensor, `probs` the
// input probability tensor. Unsupported devices yield
// DEVICE_TYPE_NOT_SUPPORTED.
//
// Fixes: removed the stray semicolon after the function's closing brace (an
// empty top-level declaration that -Wpedantic diagnoses, and this file builds
// with warnings treated as errors); case bracing normalized to match the
// sibling dispatchers.
__C infiniopStatus_t infiniopCreateRandomSampleDescriptor(infiniopHandle_t handle, infiniopRandomSampleDescriptor_t *desc_ptr, infiniopTensorDescriptor_t result, infiniopTensorDescriptor_t probs) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateRandomSampleDescriptor(handle, (RandomSampleCpuDescriptor_t *) desc_ptr, result, probs);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateRandomSampleDescriptor((CudaHandle_t) handle, (RandomSampleCudaDescriptor_t *) desc_ptr, result, probs);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateRandomSampleDescriptor((BangHandle_t) handle,
                                                    (RandomSampleBangDescriptor_t *) desc_ptr, result,
                                                    probs);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendCreateRandomSampleDescriptor((AscendHandle_t) handle,
                                                      (RandomSampleAscendDescriptor_t *) desc_ptr, result, probs);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateRandomSampleDescriptor((MacaHandle_t) handle,
                                                    (RandomSampleMacaDescriptor_t *) desc_ptr, result,
                                                    probs);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateRandomSampleDescriptor((MusaHandle_t) handle, (RandomSampleMusaDescriptor_t *) desc_ptr, result, probs);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Query the workspace size required by random sampling for this descriptor's
// backend (desc->device).
__C infiniopStatus_t infiniopGetRandomSampleWorkspaceSize(infiniopRandomSampleDescriptor_t desc, uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuGetRandomSampleWorkspaceSize((RandomSampleCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaGetRandomSampleWorkspaceSize((RandomSampleCudaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangGetRandomSampleWorkspaceSize((RandomSampleBangDescriptor_t) desc, size);
            // return cnnlGetRandomSampleWorkspaceSize((RandomSampleCnnlDescriptor_t) desc, size);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendGetRandomSampleWorkspaceSize((RandomSampleAscendDescriptor_t) desc, size);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaGetRandomSampleWorkspaceSize((RandomSampleMacaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaGetRandomSampleWorkspaceSize((RandomSampleMusaDescriptor_t) desc, size);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Sample a token index from `probs` into `result`, dispatching on
// desc->device. `random_val`, `topp`, `topk` and `temperature` parameterize
// the sampling; their exact semantics live in each backend implementation.
__C infiniopStatus_t infiniopRandomSample(infiniopRandomSampleDescriptor_t desc,
                                          void *workspace,
                                          uint64_t workspace_size,
                                          void *result,
                                          void const *probs,
                                          float random_val,
                                          float topp,
                                          int topk,
                                          float temperature,
                                          void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuRandomSample((RandomSampleCpuDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaRandomSample((RandomSampleCudaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangRandomSample((RandomSampleBangDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendRandomSample((RandomSampleAscendDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaRandomSample((RandomSampleMacaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaRandomSample((RandomSampleMusaDescriptor_t) desc, workspace, workspace_size, result, probs, random_val, topp, topk, temperature, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a random-sample descriptor via the backend that owns it.
__C infiniopStatus_t infiniopDestroyRandomSampleDescriptor(infiniopRandomSampleDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyRandomSampleDescriptor((RandomSampleCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyRandomSampleDescriptor((RandomSampleCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyRandomSampleDescriptor((RandomSampleBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendDestroyRandomSampleDescriptor((RandomSampleAscendDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyRandomSampleDescriptor((RandomSampleMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyRandomSampleDescriptor((RandomSampleMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#include "infiniop/ops/rearrange.h"
// Create a rearrange (strided copy) descriptor for the backend recorded in
// handle->device; `dst` and `src` describe the destination and source layouts.
__C infiniopStatus_t infiniopCreateRearrangeDescriptor(
    infiniopHandle_t handle,
    infiniopRearrangeDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t dst,
    infiniopTensorDescriptor_t src) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateRearrangeDescriptor(handle, (RearrangeCpuDescriptor_t *) desc_ptr, dst, src);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateRearrangeDescriptor((CudaHandle_t) handle, (RearrangeCudaDescriptor_t *) desc_ptr, dst, src);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateRearrangeDescriptor((BangHandle_t) handle, (RearrangeBangDescriptor_t *) desc_ptr, dst, src);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnCreateRearrangeDescriptor((AscendHandle_t) handle,
                                                  (RearrangeAclnnDescriptor_t *) desc_ptr,
                                                  dst,
                                                  src);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateRearrangeDescriptor((MacaHandle_t) handle, (RearrangeMacaDescriptor_t *) desc_ptr, dst, src);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateRearrangeDescriptor((MusaHandle_t) handle, (RearrangeMusaDescriptor_t *) desc_ptr, dst, src);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Copy `src` into `dst` following the layouts captured at descriptor-creation
// time, dispatching on desc->device.
__C infiniopStatus_t infiniopRearrange(infiniopRearrangeDescriptor_t desc, void *dst, void const *src, void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuRearrange((RearrangeCpuDescriptor_t) desc, dst, src, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaRearrange((RearrangeCudaDescriptor_t) desc, dst, src, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangRearrange((RearrangeBangDescriptor_t) desc, dst, src, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnRearrange((RearrangeAclnnDescriptor_t) desc,
                                  dst,
                                  src,
                                  stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaRearrange((RearrangeMacaDescriptor_t) desc, dst, src, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaRearrange((RearrangeMusaDescriptor_t) desc, dst, src, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a rearrange descriptor via the backend that owns it.
__C infiniopStatus_t infiniopDestroyRearrangeDescriptor(infiniopRearrangeDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyRearrangeDescriptor((RearrangeCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyRearrangeDescriptor((RearrangeCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyRearrangeDescriptor((RearrangeBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnDestroyRearrangeDescriptor((RearrangeAclnnDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyRearrangeDescriptor((RearrangeMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyRearrangeDescriptor((RearrangeMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#include "infiniop/ops/rms_norm.h"
// Create an RMSNorm descriptor (y = rmsnorm(x) * w, with `epsilon` added
// inside the normalization) for the backend recorded in handle->device.
__C infiniopStatus_t infiniopCreateRMSNormDescriptor(
    infiniopHandle_t handle,
    infiniopRMSNormDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t w_desc,
    float epsilon) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateRMSNormDescriptor(handle, (RMSNormCpuDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateRMSNormDescriptor((CudaHandle_t) handle, (RMSNormCudaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateRMSNormDescriptor((BangHandle_t) handle, (RMSNormBangDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnCreateRMSNormDescriptor((AscendHandle_t) handle,
                                                (RMSNormAclnnDescriptor_t *) desc_ptr,
                                                y_desc,
                                                x_desc,
                                                w_desc,
                                                epsilon);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateRMSNormDescriptor((MacaHandle_t) handle, (RMSNormMacaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateRMSNormDescriptor((MusaHandle_t) handle, (RMSNormMusaDescriptor_t *) desc_ptr, y_desc, x_desc, w_desc, epsilon);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Query the workspace size required by RMSNorm for this descriptor's backend.
__C infiniopStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t desc, uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuGetRMSNormWorkspaceSize((RMSNormCpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaGetRMSNormWorkspaceSize((RMSNormCudaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangGetRMSNormWorkspaceSize((RMSNormBangDescriptor_t) desc, size);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnGetRMSNormWorkspaceSize((RMSNormAclnnDescriptor_t) desc,
                                                size);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaGetRMSNormWorkspaceSize((RMSNormMacaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaGetRMSNormWorkspaceSize((RMSNormMusaDescriptor_t) desc, size);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Apply RMS normalization: reads `x` and weights `w`, writes `y`, dispatching
// on desc->device. Workspace must satisfy infiniopGetRMSNormWorkspaceSize.
__C infiniopStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *workspace, uint64_t workspace_size,
                                     void *y, void const *x, void const *w, void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuRMSNorm((RMSNormCpuDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaRMSNorm((RMSNormCudaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangRMSNorm((RMSNormBangDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnRMSNorm((RMSNormAclnnDescriptor_t) desc,
                                workspace,
                                workspace_size,
                                y,
                                x,
                                w,
                                stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaRMSNorm((RMSNormMacaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaRMSNorm((RMSNormMusaDescriptor_t) desc, workspace, workspace_size, y, x, w, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release an RMSNorm descriptor via the backend that owns it.
__C infiniopStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyRMSNormDescriptor((RMSNormCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyRMSNormDescriptor((RMSNormCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyRMSNormDescriptor((RMSNormBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return aclnnDestroyRMSNormDescriptor((RMSNormAclnnDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyRMSNormDescriptor((RMSNormMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyRMSNormDescriptor((RMSNormMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#include "infiniop/ops/rotary_embedding.h"
// Create a rotary-position-embedding (RoPE) descriptor for the backend
// recorded in handle->device. `t` is the tensor to rotate, `pos_ids` the
// position indices, and `sin_table`/`cos_table` the precomputed tables.
__C infiniopStatus_t infiniopCreateRoPEDescriptor(
    infiniopHandle_t handle, infiniopRoPEDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t t, infiniopTensorDescriptor_t pos_ids,
    infiniopTensorDescriptor_t sin_table,
    infiniopTensorDescriptor_t cos_table) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateRoPEDescriptor((CpuHandle_t) handle,
                                           (RoPECpuDescriptor_t *) desc_ptr, t,
                                           pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateRoPEDescriptor((CudaHandle_t) handle,
                                            (RoPECudaDescriptor_t *) desc_ptr, t,
                                            pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateRoPEDescriptor((BangHandle_t) handle,
                                            (RoPEBangDescriptor_t *) desc_ptr, t,
                                            pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendCreateRoPEDescriptor((AscendHandle_t) handle,
                                              (RoPEAscendDescriptor_t *) desc_ptr, t,
                                              pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateRoPEDescriptor((MacaHandle_t) handle,
                                            (RoPEMacaDescriptor_t *) desc_ptr, t,
                                            pos_ids, sin_table, cos_table);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateRoPEDescriptor((MusaHandle_t) handle,
                                            (RoPEMusaDescriptor_t *) desc_ptr, t,
                                            pos_ids, sin_table, cos_table);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Query the workspace size required by RoPE for this descriptor's backend.
__C infiniopStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
                                                  uint64_t *size) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuGetRoPEWorkspaceSize((RoPECpuDescriptor_t) desc, size);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaGetRoPEWorkspaceSize((RoPECudaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangGetRoPEWorkspaceSize((RoPEBangDescriptor_t) desc, size);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendGetRoPEWorkspaceSize((RoPEAscendDescriptor_t) desc, size);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaGetRoPEWorkspaceSize((RoPEMacaDescriptor_t) desc, size);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaGetRoPEWorkspaceSize((RoPEMusaDescriptor_t) desc, size);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Apply rotary position embedding in place on `t`, dispatching on
// desc->device. Workspace must satisfy infiniopGetRoPEWorkspaceSize.
__C infiniopStatus_t infiniopRoPE(infiniopRoPEDescriptor_t desc,
                                  void *workspace, uint64_t workspace_size,
                                  void *t, void const *pos_ids,
                                  void const *sin_table, void const *cos_table,
                                  void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuRoPE((RoPECpuDescriptor_t) desc, workspace, workspace_size, t,
                           pos_ids, sin_table, cos_table, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaRoPE((RoPECudaDescriptor_t) desc, workspace, workspace_size,
                            t, pos_ids, sin_table, cos_table, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangRoPE((RoPEBangDescriptor_t) desc, workspace, workspace_size,
                            t, pos_ids, sin_table, cos_table, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendRoPE((RoPEAscendDescriptor_t) desc, workspace,
                              workspace_size, t, pos_ids, sin_table, cos_table,
                              stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaRoPE((RoPEMacaDescriptor_t) desc, workspace, workspace_size,
                            t, pos_ids, sin_table, cos_table, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaRoPE((RoPEMusaDescriptor_t) desc, workspace, workspace_size,
                            t, pos_ids, sin_table, cos_table, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a RoPE descriptor via the backend that owns it.
__C infiniopStatus_t
infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroyRoPEDescriptor((RoPECpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroyRoPEDescriptor((RoPECudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroyRoPEDescriptor((RoPEBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendDestroyRoPEDescriptor((RoPEAscendDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroyRoPEDescriptor((RoPEMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroyRoPEDescriptor((RoPEMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#include "infiniop/ops/swiglu.h"
// Create a SwiGLU descriptor (c = swiglu(a, b)) for the backend recorded in
// handle->device.
//
// Fixes: removed the stray semicolon after the function's closing brace
// (empty top-level declaration, diagnosed under -Wpedantic — this file builds
// with warnings treated as errors), and cast `handle` to (MusaHandle_t) in
// the Mthreads branch for consistency with every other musa call site in
// this file (all of which cast the generic handle to the backend handle type).
__C infiniopStatus_t infiniopCreateSwiGLUDescriptor(
    infiniopHandle_t handle, infiniopSwiGLUDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {
    switch (handle->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuCreateSwiGLUDescriptor(
                handle, (SwiGLUCpuDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaCreateSwiGLUDescriptor((CudaHandle_t) handle,
                                              (SwiGLUCudaDescriptor_t *) desc_ptr,
                                              c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangCreateSwiGLUDescriptor((BangHandle_t) handle,
                                              (SwiGLUBangDescriptor_t *) desc_ptr,
                                              c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendCreateSwiGLUDescriptor(
                (AscendHandle_t) handle, (SwiGLUAscendDescriptor_t *) desc_ptr,
                c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaCreateSwiGLUDescriptor((MacaHandle_t) handle,
                                              (SwiGLUMacaDescriptor_t *) desc_ptr,
                                              c_desc, a_desc, b_desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaCreateSwiGLUDescriptor(
                (MusaHandle_t) handle, (SwiGLUMusaDescriptor_t *) desc_ptr, c_desc, a_desc, b_desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Compute SwiGLU: reads `a` and `b`, writes `c`, dispatching on desc->device.
__C infiniopStatus_t infiniopSwiGLU(infiniopSwiGLUDescriptor_t desc, void *c,
                                    void const *a, void const *b,
                                    void *stream) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuSwiGLU((SwiGLUCpuDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaSwiGLU((SwiGLUCudaDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangSwiGLU((SwiGLUBangDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendSwiGLU((SwiGLUAscendDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaSwiGLU((SwiGLUMacaDescriptor_t) desc, c, a, b, stream);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaSwiGLU((SwiGLUMusaDescriptor_t) desc, c, a, b, stream);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Release a SwiGLU descriptor via the backend that owns it.
__C infiniopStatus_t
infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
    switch (desc->device) {
#ifdef ENABLE_CPU
        case DevCpu:
            return cpuDestroySwiGLUDescriptor((SwiGLUCpuDescriptor_t) desc);
#endif
#ifdef ENABLE_NV_GPU
        case DevNvGpu:
            return cudaDestroySwiGLUDescriptor((SwiGLUCudaDescriptor_t) desc);
#endif
#ifdef ENABLE_CAMBRICON_MLU
        case DevCambriconMlu:
            return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t) desc);
#endif
#ifdef ENABLE_ASCEND_NPU
        case DevAscendNpu:
            return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t) desc);
#endif
#ifdef ENABLE_METAX_GPU
        case DevMetaxGpu:
            return macaDestroySwiGLUDescriptor((SwiGLUMacaDescriptor_t) desc);
#endif
#ifdef ENABLE_MTHREADS_GPU
        case DevMthreadsGpu:
            return musaDestroySwiGLUDescriptor((SwiGLUMusaDescriptor_t) desc);
#endif
    }
    return INFINIOP_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
......@@ -13,32 +13,28 @@
#define ROUND_UP_DIV(x, y) ((x + y - 1) / y)
#define CHECK_ERROR(call, target, errCode) \
do { \
if (auto value = (call); value == (target)) { \
std::cerr << "Error: expected " << (target) \
<< " but got " << value \
<< " in file " << __FILE__ \
<< ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return (errCode); \
} \
#define CHECK_ERROR(call, target, errCode) \
do { \
if (auto value = (call); value == (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \
return (errCode); \
} \
} while (0)
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr; \
#define CREATE_CHECK_ERROR(expr, value, target, errCode) \
expr; \
CHECK_ERROR(value, target, errCode)
#define CHECK_STATUS(call, target) \
do { \
if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) \
<< " but got " << value \
<< " in file " << __FILE__ \
<< ", function " << __func__ \
<< ", line " << __LINE__ << std::endl; \
return value; \
} \
#define CHECK_STATUS(call, target) \
do { \
if (auto value = (call); value != (target)) { \
std::cerr << "Error: expected " << (target) << " but got " \
<< value << " in file " << __FILE__ << ", function " \
<< __func__ << ", line " << __LINE__ << std::endl; \
return value; \
} \
} while (0)
inline std::vector<int64_t> get_byte_strides(infiniopTensorDescriptor_t desc) {
......@@ -53,8 +49,9 @@ inline std::vector<int64_t> get_byte_strides(infiniopTensorDescriptor_t desc) {
// calculate the broadcasted shape for two tensors
inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
const uint64_t *shape2, uint64_t ndim2,
uint64_t *broadcast_shape, uint64_t *padded_shape1,
uint64_t *padded_shape2, uint64_t max_rank) {
uint64_t *broadcast_shape,
uint64_t *padded_shape1, uint64_t *padded_shape2,
uint64_t max_rank) {
// prepending and initializing
std::fill(padded_shape1, padded_shape1 + max_rank, 1);
std::fill(padded_shape2, padded_shape2 + max_rank, 1);
......@@ -63,7 +60,8 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
// compute broadcasted shape
for (size_t i = 0; i < max_rank; ++i) {
if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) {
if (padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 ||
padded_shape2[i] == 1) {
broadcast_shape[i] = std::max(padded_shape1[i], padded_shape2[i]);
} else {
return false;
......@@ -73,31 +71,39 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1,
return true;
}
// check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c,
// check if the shape of tensor c is valid after broadcasting tensors a and b
// and also get the broadcasted shapes
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b,
infiniopTensorDescriptor_t c,
uint64_t broadcast_ndim) {
std::vector<uint64_t>
broadcast_shape_(broadcast_ndim),
padded_shape1_(broadcast_ndim),
padded_shape2_(broadcast_ndim);
std::vector<uint64_t> broadcast_shape_(broadcast_ndim),
padded_shape1_(broadcast_ndim), padded_shape2_(broadcast_ndim);
auto broadcast_shape = broadcast_shape_.data(),
padded_shape1 = padded_shape1_.data(),
padded_shape2 = padded_shape2_.data();
if (broadcast_ndim != c->ndim || !getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim, broadcast_shape, padded_shape1, padded_shape2, broadcast_ndim)) {
if (broadcast_ndim != c->ndim ||
!getBroadcastShape(a->shape, a->ndim, b->shape, b->ndim,
broadcast_shape, padded_shape1, padded_shape2,
broadcast_ndim)) {
return false;
}
return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim, c->shape);
return std::equal(broadcast_shape, broadcast_shape + broadcast_ndim,
c->shape);
}
// check if the shape of tensor src can be validly broadcasted to that of the tensor dst
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensorDescriptor_t src) {
// check if the shape of tensor src can be validly broadcasted to that of the
// tensor dst
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst,
infiniopTensorDescriptor_t src) {
if (dst->ndim < src->ndim) {
return false;
}
std::vector<size_t> padded_shape_(dst->ndim);
auto padded_shape = padded_shape_.data();
std::fill(padded_shape, padded_shape + dst->ndim, 1);
std::copy(src->shape, src->shape + src->ndim, padded_shape + dst->ndim - src->ndim);
std::copy(src->shape, src->shape + src->ndim,
padded_shape + dst->ndim - src->ndim);
for (size_t i = 0; i < dst->ndim; ++i) {
if (padded_shape[i] != dst->shape[i] && padded_shape[i] != 1) {
return false;
......@@ -107,7 +113,9 @@ inline bool isValidBroadcastShape(infiniopTensorDescriptor_t dst, infiniopTensor
}
// check if the shape of tensor c is valid after broadcasting tensors a and b;
// convenience overload: the broadcast rank is the larger of a's and b's ranks,
// with the actual shape computation delegated to the 4-argument overload
inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a,
                                  infiniopTensorDescriptor_t b,
                                  infiniopTensorDescriptor_t c) {
    return isValidBroadcastShape(a, b, c, std::max(a->ndim, b->ndim));
}
......@@ -120,7 +128,8 @@ inline size_t get_byte_size(infiniopTensorDescriptor_t desc) {
}
// permute the dimensions of a tensor descriptor
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const std::vector<size_t> &order) {
inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc,
const std::vector<size_t> &order) {
size_t ndim = desc->ndim;
if (order.size() != ndim) {
return nullptr;
......@@ -134,14 +143,16 @@ inline infiniopTensorDescriptor_t permute(infiniopTensorDescriptor_t desc, const
shape[i] = desc->shape[order[i]];
strides[i] = desc->strides[order[i]];
}
return new InfiniopTensorDescriptor{
desc->dtype, ndim, shape, strides};
return new InfiniopTensorDescriptor{desc->dtype, ndim, shape, strides};
}
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are contiguous
inline bool is_contiguous(const infiniopTensorDescriptor_t &desc, size_t dim_start, size_t dim_end) {
// check if the dimensions [dim_start, dim_end] of a tensor descriptor are
// contiguous
inline bool is_contiguous(const infiniopTensorDescriptor_t &desc,
size_t dim_start, size_t dim_end) {
for (size_t i = dim_start + 1; i <= dim_end; i++) {
if (desc->strides[i - 1] != static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
if (desc->strides[i - 1] !=
static_cast<int64_t>(desc->shape[i]) * desc->strides[i]) {
return false;
}
}
......@@ -156,7 +167,8 @@ inline bool is_contiguous(const infiniopTensorDescriptor_t &desc) {
}
// merge the dimensions [dim_start, dim_end] of a tensor descriptor
inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, size_t dim_start, size_t dim_end) {
inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc,
size_t dim_start, size_t dim_end) {
size_t ndim = desc->ndim;
if (dim_start > dim_end || dim_end >= ndim) {
return nullptr;
......@@ -185,14 +197,17 @@ inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, siz
new_strides[index] = desc->strides[i];
index++;
}
return new InfiniopTensorDescriptor{
desc->dtype, new_ndim, new_shape, new_strides};
return new InfiniopTensorDescriptor{desc->dtype, new_ndim, new_shape,
new_strides};
}
// split the dimension dim of a tensor descriptor into multiple dimensions
inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, size_t dim, const std::vector<size_t> &dims) {
inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc,
size_t dim,
const std::vector<size_t> &dims) {
size_t ndim = desc->ndim;
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1, std::multiplies{})) {
if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (size_t)1,
std::multiplies{})) {
return nullptr;
}
size_t new_ndim = ndim + dims.size() - 1;
......@@ -206,7 +221,10 @@ inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, siz
}
for (size_t i = 0; i < dims.size(); i++) {
new_shape[index] = dims[i];
new_strides[index] = desc->strides[dim] * desc->shape[dim] / std::accumulate(dims.begin(), dims.begin() + i + 1, 1, std::multiplies<size_t>());
new_strides[index] =
desc->strides[dim] * desc->shape[dim] /
std::accumulate(dims.begin(), dims.begin() + i + 1, (size_t)1,
std::multiplies<size_t>());
index++;
}
for (size_t i = dim + 1; i < ndim; i++) {
......@@ -214,8 +232,8 @@ inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, siz
new_strides[index] = desc->strides[i];
index++;
}
return new InfiniopTensorDescriptor{
desc->dtype, new_ndim, new_shape, new_strides};
return new InfiniopTensorDescriptor{desc->dtype, new_ndim, new_shape,
new_strides};
}
#endif// __UTILS_H__
#endif // __UTILS_H__
......@@ -2,6 +2,8 @@ target("infiniop-cpu")
on_install(function (target) end)
set_kind("static")
add_cxflags("-Wall", "-Werror")
if not is_plat("windows") then
add_cxflags("-fPIC")
end
......
......@@ -20,10 +20,12 @@ target("infiniop-cuda")
if is_plat("windows") then
add_cuflags("-Xcompiler=/utf-8", "--expt-relaxed-constexpr", "--allow-unsupported-compiler")
add_cuflags("-Xcompiler=/W3", "-Xcompiler=/WX")
if CUDNN_ROOT ~= nil then
add_linkdirs(CUDNN_ROOT .. "\\lib\\x64")
end
else
add_cuflags("-Xcompiler=-Wall", "-Xcompiler=-Werror")
add_cuflags("-Xcompiler=-fPIC")
add_culdflags("-Xcompiler=-fPIC")
add_cxxflags("-fPIC")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment