Merge branch 'main' into issue/300

0166515c · PanZezhong1725 · GitHub · f0300ff3 · a23c4d13 · 0166515c
Unverified Commit 0166515c authored Aug 07, 2025 by PanZezhong1725 Committed by GitHub Aug 07, 2025
20 changed files
--- a/src/infiniop/ops/relu/nvidia/relu_nvidia.cu
+++ b/src/infiniop/ops/relu/nvidia/relu_nvidia.cu
+#ifdef ENABLE_NINETOOTHED
+
+#include "../../../../../build/ninetoothed/relu.h"
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "relu_nvidia.cuh"
+
+namespace op::relu::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    const auto &ndim{_info.getNdim()};
+    const auto &x_shape_{_info.getInputShape(0)};
+    const auto &x_strides_{_info.getInputStrides(0)};
+    std::vector<uint64_t> x_shape_vec{x_shape_, x_shape_ + ndim};
+    std::vector<int64_t> x_strides_vec{x_strides_, x_strides_ + ndim};
+    auto x_data{const_cast<void *>(inputs[0])};
+    auto x_shape{x_shape_vec.data()};
+    auto x_strides{x_strides_vec.data()};
+    const NineToothedTensor x{x_data, x_shape, x_strides};
+    const auto &y_shape_{_info.getOutputShape()};
+    const auto &y_strides_{_info.getOutputStrides()};
+    std::vector<uint64_t> y_shape_vec{y_shape_, y_shape_ + ndim};
+    std::vector<int64_t> y_strides_vec{y_strides_, y_strides_ + ndim};
+    auto y_data{output};
+    auto y_shape{y_shape_vec.data()};
+    auto y_strides{y_strides_vec.data()};
+    const NineToothedTensor y{y_data, y_shape, y_strides};
+    constexpr auto block_size{1024};
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+    case INFINI_DTYPE_F32:
+    case INFINI_DTYPE_F64:
+    case INFINI_DTYPE_BF16:
+        if (launch_relu(stream, x, y, ndim, _dtype, block_size)) {
+            return INFINI_STATUS_INTERNAL_ERROR;
+        }
+        return INFINI_STATUS_SUCCESS;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::relu::nvidia
+
+#endif
--- a/src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
+++ b/src/infiniop/ops/relu/nvidia/relu_nvidia.cuh
+#ifndef __RELU_NVIDIA_API_H__
+#define __RELU_NVIDIA_API_H__
+
+#ifdef ENABLE_NINETOOTHED
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(relu, nvidia)
+
+#endif
+
+#endif // __RELU_NVIDIA_API_H__
--- a/src/infiniop/ops/relu/operator.cc
+++ b/src/infiniop/ops/relu/operator.cc
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/relu.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/relu_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#ifdef ENABLE_NINETOOTHED
+#include "nvidia/relu_nvidia.cuh"
+#endif
+#endif
+#ifdef ENABLE_METAX_API
+#ifdef ENABLE_NINETOOTHED
+#include "metax/relu_metax.h"
+#endif
+#endif
+
+__C infiniStatus_t infiniopCreateReluDescriptor(
+    infiniopHandle_t handle,
+    infiniopReluDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        return op::relu::NAMESPACE::Descriptor::create(                     \
+            handle,                                                         \
+            reinterpret_cast<op::relu::NAMESPACE::Descriptor **>(desc_ptr), \
+            y_desc,                                                         \
+            {x_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+#ifdef ENABLE_NINETOOTHED
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#endif
+#ifdef ENABLE_ILUVATAR_API
+#ifdef ENABLE_NINETOOTHED
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#endif
+#ifdef ENABLE_METAX_API
+#ifdef ENABLE_NINETOOTHED
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetReluWorkspaceSize(infiniopReluDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                                \
+    case CASE:                                                                              \
+        *size = reinterpret_cast<op::relu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+#ifdef ENABLE_NINETOOTHED
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#endif
+#ifdef ENABLE_ILUVATAR_API
+#ifdef ENABLE_NINETOOTHED
+        GET(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#endif
+#ifdef ENABLE_METAX_API
+#ifdef ENABLE_NINETOOTHED
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopRelu(
+    infiniopReluDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                                 \
+        return reinterpret_cast<const op::relu::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, {x}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+#ifdef ENABLE_NINETOOTHED
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#endif
+#ifdef ENABLE_ILUVATAR_API
+#ifdef ENABLE_NINETOOTHED
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#endif
+#ifdef ENABLE_METAX_API
+#ifdef ENABLE_NINETOOTHED
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyReluDescriptor(infiniopReluDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                  \
+        delete reinterpret_cast<const op::relu::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+#ifdef ENABLE_NINETOOTHED
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#endif
+#ifdef ENABLE_ILUVATAR_API
+#ifdef ENABLE_NINETOOTHED
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#endif
+#ifdef ENABLE_METAX_API
+#ifdef ENABLE_NINETOOTHED
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
--- a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
+++ b/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
 #ifndef __RMS_NORM_CUDA_KERNEL_H__
 #define __RMS_NORM_CUDA_KERNEL_H__

-#include "../../../devices/cuda/cuda_kernel_common.cuh"
-#include "../../../reduce/cuda/reduce.cuh"
-
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
-INFINIOP_CUDA_KERNEL rmsnormBlock(
+template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
+__device__ void rmsnormBlock(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y,
    const Tdata *__restrict__ x,

--- a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh
+++ b/src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh
-#ifndef __RMS_NORM_MACA_CUH__
-#define __RMS_NORM_MACA_CUH__
+#ifndef __RMS_NORM_METAX_CUH__
+#define __RMS_NORM_METAX_CUH__

 #include "../rms_norm.h"

-DESCRIPTOR(maca)
+DESCRIPTOR(metax)

 #endif
--- a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca
+++ b/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca
-#include "../../../devices/maca/common_maca.h"
-#include "../cuda/rms_norm_kernel.cuh"
-#include "rms_norm_maca.cuh"
+#include "../../../devices/metax/metax_common.h"
+#include "rms_norm_metax.cuh"

-namespace op::rms_norm::maca {
+#include "../../../devices/metax/metax_kernel_common.h"
+#include <cub/block/block_reduce.cuh>
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "../cuda/kernel.cuh"
+
+template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
+INFINIOP_METAX_KERNEL rmsnormKernel(
+    Tdata *__restrict__ y,
+    ptrdiff_t stride_y,
+    const Tdata *__restrict__ x,
+    ptrdiff_t stride_x,
+    const Tweight *__restrict__ w,
+    size_t dim,
+    float epsilon) {
+    rmsnormBlock<BLOCK_SIZE, Tcompute>(y, stride_y, x, stride_x, w, dim, epsilon);
+}
+
+namespace op::rms_norm::metax {

 struct Descriptor::Opaque {
-    std::shared_ptr<device::maca::Handle::Internal> internal;
+    std::shared_ptr<device::metax::Handle::Internal> internal;
 };

 Descriptor::~Descriptor() {
@@ -29,7 +47,7 @@ infiniStatus_t Descriptor::create(
    }

    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::maca::Handle *>(handle)->internal()},
+        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        std::move(info),
        0,
        handle->device, handle->device_id);
@@ -44,20 +62,24 @@ infiniStatus_t launchKernel(
    const void *x, ptrdiff_t stride_x,
    const void *w, infiniDtype_t wtype,
    float epsilon,
-    hcStream_t maca_stream) {
-
-#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                     \
-    rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, maca_stream>>>( \
-        reinterpret_cast<Tdata *>(y),                                                               \
-        stride_y,                                                                                   \
-        reinterpret_cast<const Tdata *>(x),                                                         \
-        stride_x,                                                                                   \
-        reinterpret_cast<const Tweight *>(w),                                                       \
-        dim,                                                                                        \
+    hcStream_t stream) {
+
+#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                      \
+    rmsnormKernel<BLOCK_SIZE, Tcompute, Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, stream>>>( \
+        reinterpret_cast<Tdata *>(y),                                                                \
+        stride_y,                                                                                    \
+        reinterpret_cast<const Tdata *>(x),                                                          \
+        stride_x,                                                                                    \
+        reinterpret_cast<const Tweight *>(w),                                                        \
+        dim,                                                                                         \
        epsilon)

    if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
        LAUNCH_KERNEL(half, half, float);
+    } else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_BF16) {
+        LAUNCH_KERNEL(__hpcc_bfloat16, __hpcc_bfloat16, float);
+    } else if (atype == INFINI_DTYPE_BF16 && wtype == INFINI_DTYPE_F32) {
+        LAUNCH_KERNEL(__hpcc_bfloat16, float, float);
    } else if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(half, float, float);
    } else if (atype == INFINI_DTYPE_F32 && wtype == INFINI_DTYPE_F32) {
@@ -74,7 +96,7 @@ infiniStatus_t launchKernel(
 infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y, const void *x, const void *w,
-    void *stream) const {
+    void *stream_) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
@@ -84,14 +106,14 @@ infiniStatus_t Descriptor::calculate(
    auto stride_y = _info.y_strides[0];
    auto dim = _info.dim();
    uint32_t batch_size = static_cast<uint32_t>(_info.shape[0]);
-    auto maca_stream = reinterpret_cast<hcStream_t>(stream);
+    auto stream = reinterpret_cast<hcStream_t>(stream_);

    // launch kernel with different block sizes
-    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
-        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream));
+    if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::rms_norm::maca
+} // namespace op::rms_norm::metax
--- a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
+++ b/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
-#include "../../../devices/cuda/cuda_common.cuh"
-#include "rms_norm_cuda.cuh"
-#include "rms_norm_kernel.cuh"
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "rms_norm_nvidia.cuh"

-namespace op::rms_norm::cuda {
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include <cub/block/block_reduce.cuh>
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "../cuda/kernel.cuh"
+
+template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
+INFINIOP_CUDA_KERNEL rmsnormKernel(
+    Tdata *__restrict__ y,
+    ptrdiff_t stride_y,
+    const Tdata *__restrict__ x,
+    ptrdiff_t stride_x,
+    const Tweight *__restrict__ w,
+    size_t dim,
+    float epsilon) {
+    rmsnormBlock<BLOCK_SIZE, Tcompute>(y, stride_y, x, stride_x, w, dim, epsilon);
+}
+
+namespace op::rms_norm::nvidia {

 struct Descriptor::Opaque {
-    std::shared_ptr<device::cuda::Handle::Internal> internal;
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
 };

 Descriptor::~Descriptor() {
@@ -29,7 +47,7 @@ infiniStatus_t Descriptor::create(
    }

    *desc_ptr = new Descriptor(
-        new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        std::move(info),
        0,
        handle->device, handle->device_id);
@@ -46,14 +64,14 @@ infiniStatus_t launchKernel(
    float epsilon,
    cudaStream_t cuda_stream) {

-#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                     \
-    rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
-        reinterpret_cast<Tdata *>(y),                                                               \
-        stride_y,                                                                                   \
-        reinterpret_cast<const Tdata *>(x),                                                         \
-        stride_x,                                                                                   \
-        reinterpret_cast<const Tweight *>(w),                                                       \
-        dim,                                                                                        \
+#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                      \
+    rmsnormKernel<BLOCK_SIZE, Tcompute, Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
+        reinterpret_cast<Tdata *>(y),                                                                \
+        stride_y,                                                                                    \
+        reinterpret_cast<const Tdata *>(x),                                                          \
+        stride_x,                                                                                    \
+        reinterpret_cast<const Tweight *>(w),                                                        \
+        dim,                                                                                         \
        epsilon)

    if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
@@ -102,4 +120,4 @@ infiniStatus_t Descriptor::calculate(
    }
    return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::rms_norm::cuda
+} // namespace op::rms_norm::nvidia
--- a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
+++ b/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
@@ -3,6 +3,6 @@

 #include "../rms_norm.h"

-DESCRIPTOR(cuda)
+DESCRIPTOR(nvidia)

 #endif
--- a/src/infiniop/ops/rms_norm/operator.cc
+++ b/src/infiniop/ops/rms_norm/operator.cc
@@ -5,14 +5,14 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/rms_norm_cpu.h"
 #endif
-#ifdef ENABLE_CUDA_API
-#include "cuda/rms_norm_cuda.cuh"
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/rms_norm_nvidia.cuh"
 #endif
 #ifdef ENABLE_ASCEND_API
 #include "ascend/rms_norm_aclnn.h"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/rms_norm_maca.cuh"
+#include "metax/rms_norm_metax.cuh"
 #endif
 #ifdef ENABLE_MOORE_API
 #include "musa/rms_norm_musa.cuh"
@@ -37,17 +37,20 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
            y_desc,                                                             \
            x_desc,                                                             \
            w_desc,                                                             \
-            epsilon);
+            epsilon)

    switch (handle->device) {
 #ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu)
+        CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda)
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun)
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -55,13 +58,13 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
    }
 #endif
 #ifdef ENABLE_ASCEND_API
-        CREATE(INFINI_DEVICE_ASCEND, ascend)
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca)
+        CREATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, musa)
+        CREATE(INFINI_DEVICE_MOORE, musa);
 #endif
    }

@@ -75,17 +78,20 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
 #define GET(CASE, NAMESPACE)                                                                    \
    case CASE:                                                                                  \
        *size = reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
-#ifdef ENABLE_CUDA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun)
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -93,13 +99,13 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
    }
 #endif
 #ifdef ENABLE_ASCEND_API
-        GET(INFINI_DEVICE_ASCEND, ascend)
+        GET(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, maca)
+        GET(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, musa)
+        GET(INFINI_DEVICE_MOORE, musa);
 #endif
    }

@@ -114,17 +120,20 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
 #define CALCULATE(CASE, NAMESPACE)                                                       \
    case CASE:                                                                           \
        return reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->calculate( \
-            workspace, workspace_size, y, x, w, stream);
+            workspace, workspace_size, y, x, w, stream)

    switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu)
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda)
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun)
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -132,13 +141,13 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
    }
 #endif
 #ifdef ENABLE_ASCEND_API
-        CALCULATE(INFINI_DEVICE_ASCEND, ascend)
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca)
+        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, musa)
+        CALCULATE(INFINI_DEVICE_MOORE, musa);
 #endif
    }

@@ -152,17 +161,20 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
 #define DESTROY(CASE, NAMESPACE)                                              \
    case CASE:                                                                \
        delete reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        DESTROY(INFINI_DEVICE_CPU, cpu)
+        DESTROY(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
-#ifdef ENABLE_CUDA_API
-        DESTROY(INFINI_DEVICE_NVIDIA, cuda)
+#ifdef ENABLE_ILUVATAR_API
+        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
-        DESTROY(INFINI_DEVICE_KUNLUN, kunlun)
+        DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -170,13 +182,13 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
    }
 #endif
 #ifdef ENABLE_ASCEND_API
-        DESTROY(INFINI_DEVICE_ASCEND, ascend)
+        DESTROY(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        DESTROY(INFINI_DEVICE_METAX, maca)
+        DESTROY(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
-        DESTROY(INFINI_DEVICE_MOORE, musa)
+        DESTROY(INFINI_DEVICE_MOORE, musa);
 #endif
    }


--- a/src/infiniop/ops/rope/cuda/rope_cuda_kernel.cuh
+++ b/src/infiniop/ops/rope/cuda/rope_cuda_kernel.cuh
 #ifndef __INFINIOP_ROPE_CUDA_KERNEL_CUH__
 #define __INFINIOP_ROPE_CUDA_KERNEL_CUH__

-#include "../../../devices/cuda/cuda_kernel_common.cuh"
-
 template <typename Tdata, typename Tindex, typename Tangle>
-INFINIOP_CUDA_KERNEL ropeThreadPerItem(
+__device__ void ropeThreadPerItemBlock(
    Tdata *y_,
    const Tdata *x_,
    const Tindex *__restrict__ pos_ids,
@@ -30,9 +28,9 @@ INFINIOP_CUDA_KERNEL ropeThreadPerItem(
            Tangle y0 = x.x * cos__ - x.y * sin__,
                   y1 = x.x * sin__ + x.y * cos__;
            y = half2(y0, y1);
-        } else if constexpr (std::is_same<Tdata, __nv_bfloat16>::value) {
-            auto &y = reinterpret_cast<__nv_bfloat162 &>(y_[y_offset + 2 * i]);
-            auto &x = reinterpret_cast<const __nv_bfloat162 &>(x_[x_offset + 2 * i]);
+        } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
+            auto &y = reinterpret_cast<cuda_bfloat162 &>(y_[y_offset + 2 * i]);
+            auto &x = reinterpret_cast<const cuda_bfloat162 &>(x_[x_offset + 2 * i]);

            Tangle x0 = __low2bfloat16(x);
            Tangle x1 = __high2bfloat16(x);

--- a/src/infiniop/ops/rope/maca/rope_maca.h
+++ b/src/infiniop/ops/rope/maca/rope_maca.h
-#ifndef __INFINIOP_ROPE_MACA_H__
-#define __INFINIOP_ROPE_MACA_H__
-
-#include "../rope.h"
-
-DESCRIPTOR(maca)
-
-#endif // __INFINIOP_ROPE_MACA_H__
--- a/src/infiniop/ops/rope/maca/rope_maca_kernel.h
+++ b/src/infiniop/ops/rope/maca/rope_maca_kernel.h
-#ifndef __INFINIOP_ROPE_MACA_KERNEL_H__
-#define __INFINIOP_ROPE_MACA_KERNEL_H__
-
-#include "../../../devices/maca/maca_kernel_common.h"
-
-template <typename Tdata, typename Tindex, typename Tangle>
-INFINIOP_MACA_KERNEL ropeThreadPerItem(
-    Tdata *y_,
-    const Tdata *x_,
-    const Tindex *__restrict__ pos_ids,
-    const Tangle *__restrict__ sin_table,
-    const Tangle *__restrict__ cos_table,
-    size_t table_dim,
-    ptrdiff_t y_stride_seqlen,
-    ptrdiff_t y_stride_nhead,
-    ptrdiff_t x_stride_seqlen,
-    ptrdiff_t x_stride_nhead) {
-
-    auto y_offset = blockIdx.x * y_stride_seqlen + blockIdx.y * y_stride_nhead;
-    auto x_offset = blockIdx.x * x_stride_seqlen + blockIdx.y * x_stride_nhead;
-    size_t pos_id = size_t(pos_ids[blockIdx.x]);
-    auto table_offset = pos_id * table_dim;
-
-    for (size_t i = threadIdx.x; i < table_dim; i += blockDim.x) {
-        Tangle sin__ = sin_table[table_offset + i],
-               cos__ = cos_table[table_offset + i];
-        if constexpr (std::is_same<Tdata, half>::value) {
-            auto &y = reinterpret_cast<half2 &>(y_[y_offset + 2 * i]);
-            auto &x = reinterpret_cast<const half2 &>(x_[x_offset + 2 * i]);
-            Tangle y0 = x.x * cos__ - x.y * sin__,
-                   y1 = x.x * sin__ + x.y * cos__;
-            y = half2(y0, y1);
-        } else {
-            Tangle x0 = x_[x_offset + 2 * i],
-                   x1 = x_[x_offset + 2 * i + 1];
-            y_[y_offset + 2 * i] = Tdata(x0 * cos__ - x1 * sin__);
-            y_[y_offset + 2 * i + 1] = Tdata(x0 * sin__ + x1 * cos__);
-        }
-    }
-}
-
-#endif
--- a/src/infiniop/ops/rope/metax/rope_metax.h
+++ b/src/infiniop/ops/rope/metax/rope_metax.h
+#ifndef __INFINIOP_ROPE_METAX_H__
+#define __INFINIOP_ROPE_METAX_H__
+
+#include "../rope.h"
+
+DESCRIPTOR(metax)
+
+#endif // __INFINIOP_ROPE_METAX_H__
--- a/src/infiniop/ops/rope/maca/rope_maca.maca
+++ b/src/infiniop/ops/rope/maca/rope_maca.maca
-#include "../../../devices/maca/common_maca.h"
-#include "rope_maca.h"
-#include "rope_maca_kernel.h"
+#include "../../../devices/metax/metax_common.h"
+#include "rope_metax.h"
+
+#include "../../../devices/metax/metax_kernel_common.h"
+
+#include "../cuda/kernel.cuh"
+
+template <typename Tdata, typename Tindex, typename Tangle>
+INFINIOP_METAX_KERNEL ropeThreadPerItemKernel(
+    Tdata *y_,
+    const Tdata *x_,
+    const Tindex *__restrict__ pos_ids,
+    const Tangle *__restrict__ sin_table,
+    const Tangle *__restrict__ cos_table,
+    size_t table_dim,
+    ptrdiff_t y_stride_seqlen,
+    ptrdiff_t y_stride_nhead,
+    ptrdiff_t x_stride_seqlen,
+    ptrdiff_t x_stride_nhead) {
+    ropeThreadPerItemBlock(
+        y_, x_, pos_ids,
+        sin_table, cos_table,
+        table_dim,
+        y_stride_seqlen, y_stride_nhead,
+        x_stride_seqlen, x_stride_nhead);
+}

-namespace op::rope::maca {
+namespace op::rope::metax {

 struct Descriptor::Opaque {
-    std::shared_ptr<device::maca::Handle::Internal> internal;
+    std::shared_ptr<device::metax::Handle::Internal> internal;
 };

 Descriptor::~Descriptor() {
@@ -21,7 +44,7 @@ infiniStatus_t Descriptor::create(
    infiniopTensorDescriptor_t sin_desc,
    infiniopTensorDescriptor_t cos_desc) {

-    auto handle = reinterpret_cast<device::maca::Handle *>(handle_);
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);

    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
    CHECK_RESULT(info);
@@ -30,7 +53,7 @@ infiniStatus_t Descriptor::create(
    *desc_ptr = new Descriptor(
        info.take(),
        0,
-        new Opaque{reinterpret_cast<device::maca::Handle *>(handle)->internal()},
+        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        handle->device,
        handle->device_id);

@@ -50,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
         dimy = uint32_t(info.nhead);
    int nthreads = std::max(int(info.table_dim), block_size);

-    ropeThreadPerItem<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+    ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
        y, x, pos_ids, sin_table, cos_table, info.table_dim,
        info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);

@@ -102,6 +125,8 @@ infiniStatus_t Descriptor::calculate(
    switch (_info.data_type) {
    case INFINI_DTYPE_F16:
        ROPE_TYPE(half);
+    case INFINI_DTYPE_BF16:
+        ROPE_TYPE(cuda_bfloat16);
    case INFINI_DTYPE_F32:
        ROPE_TYPE(float);
    case INFINI_DTYPE_F64:
@@ -116,4 +141,4 @@ infiniStatus_t Descriptor::calculate(
 #undef ROPE_TYPE
 #undef CALCULATE_ROPE

-} // namespace op::rope::maca
+} // namespace op::rope::metax
--- a/src/infiniop/ops/rope/cuda/rope_cuda.cu
+++ b/src/infiniop/ops/rope/cuda/rope_cuda.cu
-#include "../../../devices/cuda/cuda_common.cuh"
-#include "rope_cuda.cuh"
-#include "rope_cuda_kernel.cuh"
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "rope_nvidia.cuh"
+
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+
+#include "../cuda/kernel.cuh"
+
+template <typename Tdata, typename Tindex, typename Tangle>
+INFINIOP_CUDA_KERNEL ropeThreadPerItemKernel(
+    Tdata *y_,
+    const Tdata *x_,
+    const Tindex *__restrict__ pos_ids,
+    const Tangle *__restrict__ sin_table,
+    const Tangle *__restrict__ cos_table,
+    size_t table_dim,
+    ptrdiff_t y_stride_seqlen,
+    ptrdiff_t y_stride_nhead,
+    ptrdiff_t x_stride_seqlen,
+    ptrdiff_t x_stride_nhead) {
+    ropeThreadPerItemBlock(
+        y_, x_, pos_ids,
+        sin_table, cos_table,
+        table_dim,
+        y_stride_seqlen, y_stride_nhead,
+        x_stride_seqlen, x_stride_nhead);
+}

-namespace op::rope::cuda {
+namespace op::rope::nvidia {

 struct Descriptor::Opaque {
-    std::shared_ptr<device::cuda::Handle::Internal> internal;
+    std::shared_ptr<device::nvidia::Handle::Internal> internal;
 };

 Descriptor::~Descriptor() {
@@ -21,7 +44,7 @@ infiniStatus_t Descriptor::create(
    infiniopTensorDescriptor_t sin_desc,
    infiniopTensorDescriptor_t cos_desc) {

-    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);

    auto info = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
    CHECK_RESULT(info);
@@ -30,7 +53,7 @@ infiniStatus_t Descriptor::create(
    *desc_ptr = new Descriptor(
        info.take(),
        0,
-        new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        handle->device,
        handle->device_id);

@@ -50,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
         dimy = uint32_t(info.nhead);
    int nthreads = std::max(int(info.table_dim), block_size);

-    ropeThreadPerItem<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+    ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
        y, x, pos_ids, sin_table, cos_table, info.table_dim,
        info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);

@@ -103,7 +126,7 @@ infiniStatus_t Descriptor::calculate(
    case INFINI_DTYPE_F16:
        ROPE_TYPE(half);
    case INFINI_DTYPE_BF16:
-        ROPE_TYPE(__nv_bfloat16);
+        ROPE_TYPE(cuda_bfloat16);
    case INFINI_DTYPE_F32:
        ROPE_TYPE(float);
    case INFINI_DTYPE_F64:
@@ -118,4 +141,4 @@ infiniStatus_t Descriptor::calculate(
 #undef ROPE_TYPE
 #undef CALCULATE_ROPE

-} // namespace op::rope::cuda
+} // namespace op::rope::nvidia
--- a/src/infiniop/ops/rope/cuda/rope_cuda.cuh
+++ b/src/infiniop/ops/rope/cuda/rope_cuda.cuh
@@ -3,6 +3,6 @@

 #include "../rope.h"

-DESCRIPTOR(cuda)
+DESCRIPTOR(nvidia)

 #endif // __INFINIOP_ROPE_CUDA_H__
--- a/src/infiniop/ops/rope/operator.cc
+++ b/src/infiniop/ops/rope/operator.cc
@@ -5,14 +5,14 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/rope_cpu.h"
 #endif
-#ifdef ENABLE_CUDA_API
-#include "cuda/rope_cuda.cuh"
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/rope_nvidia.cuh"
 #endif
 #ifdef ENABLE_ASCEND_API
 #include "ascend/rope_ascend.h"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/rope_maca.h"
+#include "metax/rope_metax.h"
 #endif

 __C infiniStatus_t infiniopCreateRoPEDescriptor(
@@ -39,11 +39,17 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
 #ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca);
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -52,16 +58,6 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
                                        pos_ids, sin_table, cos_table);
    }
 #endif
-#ifdef ENABLE_ASCEND_API
-        CREATE(INFINI_DEVICE_ASCEND, ascend);
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaCreateRoPEDescriptor((MacaHandle_t)handle,
-                                        (RoPEMacaDescriptor_t *)desc_ptr, t,
-                                        pos_ids, sin_table, cos_table);
-    }
-#endif
 #ifdef ENABLE_MTHREADS_GPU
    case DevMthreadsGpu: {
        return musaCreateRoPEDescriptor((MusaHandle_t)handle,
@@ -87,11 +83,14 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
 #ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, maca);
+        GET(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -138,11 +137,14 @@ __C infiniStatus_t infiniopRoPE(
 #ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca);
+        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {
@@ -184,11 +186,14 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
 #ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
-#ifdef ENABLE_CUDA_API
-        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, maca);
+        DELETE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
    case DevCambriconMlu: {

--- a/src/infiniop/ops/sub/cpu/sub_cpu.cc
+++ b/src/infiniop/ops/sub/cpu/sub_cpu.cc
@@ -19,7 +19,7 @@ infiniStatus_t Descriptor::create(
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();

-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);

    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

@@ -43,6 +43,8 @@ infiniStatus_t Descriptor::calculate(
        return _device_info->calculate<SubOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<SubOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<SubOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

--- a/src/infiniop/ops/sub/cuda/sub_cuda_internal.cuh
+++ b/src/infiniop/ops/sub/cuda/sub_cuda_internal.cuh
 #ifndef __SUB_CUDA_H__
 #define __SUB_CUDA_H__

-#include "../../../elementwise/cuda/elementwise_cuda.cuh"
-#include <cuda_fp16.h>
-
 namespace op::sub::cuda {
 typedef struct SubOp {
 public:
    static constexpr size_t num_inputs = 2;
    template <typename T>
    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-        if constexpr (std::is_same_v<T, half2>) {
+        if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, cuda_bfloat162>) {
            return __hsub2(a, b);
-        } else if constexpr (std::is_same_v<T, half>) {
+        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
            return __hsub(a, b);
        } else if constexpr (std::is_same_v<T, float>) {
            return __fsub_rd(a, b);

--- a/src/infiniop/ops/sub/metax/sub_metax.h
+++ b/src/infiniop/ops/sub/metax/sub_metax.h
+#ifndef __SUB_METAX_API_H__
+#define __SUB_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(sub, metax)
+
+#endif // __SUB_METAX_API_H__