Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
93191613
Unverified
Commit
93191613
authored
Mar 13, 2026
by
thatPepe
Committed by
GitHub
Mar 13, 2026
Browse files
Merge pull request #1075 from InfiniTensor/RevertT_1-1-4
Revert T1-1-4
parents
6ab911c3
def22a08
Changes
203
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
1445 deletions
+0
-1445
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
+0
-129
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu
...infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu
+0
-107
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh
...nfiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh
+0
-8
src/infiniop/ops/cross_entropy/operator.cc
src/infiniop/ops/cross_entropy/operator.cc
+0
-174
src/infiniop/ops/equal/cpu/equal_cpu.cc
src/infiniop/ops/equal/cpu/equal_cpu.cc
+0
-68
src/infiniop/ops/equal/cpu/equal_cpu.h
src/infiniop/ops/equal/cpu/equal_cpu.h
+0
-28
src/infiniop/ops/equal/cuda/kernel.cuh
src/infiniop/ops/equal/cuda/kernel.cuh
+0
-37
src/infiniop/ops/equal/metax/equal_metax.h
src/infiniop/ops/equal/metax/equal_metax.h
+0
-8
src/infiniop/ops/equal/metax/equal_metax.maca
src/infiniop/ops/equal/metax/equal_metax.maca
+0
-69
src/infiniop/ops/equal/moore/equal_moore.h
src/infiniop/ops/equal/moore/equal_moore.h
+0
-8
src/infiniop/ops/equal/moore/equal_moore.mu
src/infiniop/ops/equal/moore/equal_moore.mu
+0
-140
src/infiniop/ops/equal/moore/equal_moore_kernel.h
src/infiniop/ops/equal/moore/equal_moore_kernel.h
+0
-30
src/infiniop/ops/equal/nvidia/equal_nvidia.cu
src/infiniop/ops/equal/nvidia/equal_nvidia.cu
+0
-137
src/infiniop/ops/equal/nvidia/equal_nvidia.cuh
src/infiniop/ops/equal/nvidia/equal_nvidia.cuh
+0
-8
src/infiniop/ops/equal/operator.cc
src/infiniop/ops/equal/operator.cc
+0
-201
src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
+0
-91
src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
+0
-50
src/infiniop/ops/hardswish/cuda/kernel.cuh
src/infiniop/ops/hardswish/cuda/kernel.cuh
+0
-86
src/infiniop/ops/hardswish/metax/hardswish_metax.h
src/infiniop/ops/hardswish/metax/hardswish_metax.h
+0
-8
src/infiniop/ops/hardswish/metax/hardswish_metax.maca
src/infiniop/ops/hardswish/metax/hardswish_metax.maca
+0
-58
No files found.
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
deleted
100644 → 0
View file @
6ab911c3
#include "../../../devices/moore/moore_common.h"
#include "cross_entropy_moore.h"
#include <cub/block/block_reduce.cuh>
#include "../../../devices/moore/moore_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
#include "cross_entropy_kernel.h"
// Device-side entry point for the Moore (MUSA) backend: per the launch in
// launchKernel below, one thread block handles one row of logits. The actual
// reduction/loss math lives in the shared crossEntropyKernel
// (cross_entropy_kernel.h); this wrapper only instantiates it.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
INFINIOP_MOORE_KERNEL crossEntropy(
    Tdata *y, const Tdata *x, const void *target,
    size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
    crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
        y, x, target, outer_size, vocab_size, x_stride);
}
namespace op::cross_entropy::moore {
// Backend-private state: keeps the MUSA handle internals (device properties
// such as maxThreadsPerBlock, used in calculate()) alive for the descriptor's
// lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};
// Releases the backend-private state allocated in create().
Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates the tensor descriptors and builds a Moore-backend descriptor for
// the cross-entropy op.
//
// @param handle       backend handle (must be a device::moore::Handle)
// @param desc_ptr     receives the newly allocated Descriptor
// @param y_desc       output (loss) tensor descriptor
// @param x_desc       logits; last dimension is the vocabulary/class axis
// @param target_desc  class-index tensor (I32 or I64), one index per row
// @return INFINI_STATUS_SUCCESS or a dtype-validation error
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();
    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
    // Fix: y_desc was previously ignored entirely ((void)y_desc). The kernels
    // write Tdata values into y, so the output must share the logits dtype.
    if (y_desc->dtype() != x_dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.vocab_size = x_desc->shape().back();
    info.outer_size = target_desc->numel();
    // Logits are treated as contiguous along the class dimension.
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
        info, 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Dispatches the crossEntropy kernel over the (target dtype, logits dtype)
// pair recorded in `info`. One thread block is launched per output row.
// Unsupported combinations yield INFINI_STATUS_BAD_TENSOR_DTYPE.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
                            const CrossEntropyInfo &info, musaStream_t stream) {
    dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);

#define LAUNCH_CE_(TDATA, TIDX)                               \
    crossEntropy<BLOCK_SIZE, TDATA, TIDX, float>              \
        <<<grid, BLOCK_SIZE, 0, stream>>>(                    \
            (TDATA *)y, (const TDATA *)x, target,             \
            info.outer_size, info.vocab_size, info.x_stride); \
    return INFINI_STATUS_SUCCESS

#define DISPATCH_CE_(TIDX)                      \
    switch (info.dtype) {                       \
    case INFINI_DTYPE_F16:                      \
        LAUNCH_CE_(half, TIDX);                 \
    case INFINI_DTYPE_BF16:                     \
        LAUNCH_CE_(__mt_bfloat16, TIDX);        \
    case INFINI_DTYPE_F32:                      \
        LAUNCH_CE_(float, TIDX);                \
    default:                                    \
        return INFINI_STATUS_BAD_TENSOR_DTYPE;  \
    }

    switch (info.target_dtype) {
    case INFINI_DTYPE_I64:
        DISPATCH_CE_(int64_t)
    case INFINI_DTYPE_I32:
        DISPATCH_CE_(int32_t)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

#undef DISPATCH_CE_
#undef LAUNCH_CE_
}
// Runs cross-entropy on the given stream, picking the block size supported by
// the current device. No scratch memory is used (workspace size is 0).
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *y,
                                     const void *x,
                                     const void *target,
                                     void *stream_) const {
    (void)workspace;
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    auto stream = (musaStream_t)stream_;
    switch (_opaque->internal->maxThreadsPerBlock()) {
    case MOORE_BLOCK_SIZE_1024:
        return launchKernel<MOORE_BLOCK_SIZE_1024>(y, x, target, _info, stream);
    case MOORE_BLOCK_SIZE_512:
        return launchKernel<MOORE_BLOCK_SIZE_512>(y, x, target, _info, stream);
    default:
        // Devices with other block-size limits are not handled by this backend.
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
}
} // namespace op::cross_entropy::moore
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu
deleted
100644 → 0
View file @
6ab911c3
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "cross_entropy_nvidia.cuh"
// Device-side entry point (NVIDIA backend): thin wrapper that instantiates
// the shared crossEntropyKernel implementation (../cuda/kernel.cuh).
// Tcompute defaults to float for the accumulation type.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx,
          typename Tcompute = float>
INFINIOP_CUDA_KERNEL crossEntropy(Tdata *y,
                                  const Tdata *x,
                                  const void *target,
                                  size_t outer_size,
                                  size_t vocab_size,
                                  ptrdiff_t x_stride) {
    crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
        y, x, target, outer_size, vocab_size, x_stride);
}
namespace
op
::
cross_entropy
::
nvidia
{
// Backend-private state: keeps the NVIDIA handle internals (device limits,
// queried in calculate()) alive for the descriptor's lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
// Releases the backend-private state allocated in create().
Descriptor::~Descriptor() {
    delete _opaque;
}
// Builds an NVIDIA-backend descriptor for the cross-entropy op.
//
// @param handle       backend handle (must be a device::nvidia::Handle)
// @param desc_ptr     receives the newly allocated Descriptor
// @param y_desc       output (loss) tensor descriptor (currently unused)
// @param x_desc       logits; last dimension is the vocabulary/class axis
// @param target_desc  class-index tensor, one index per row
// @return INFINI_STATUS_SUCCESS or a dtype-validation error
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    (void)y_desc;
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();
    // Fix: this backend accepted any dtype and only failed (or, worse, ran
    // nothing) later in launchKernel. Validate up front like the Moore
    // backend does.
    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
    // Fix: value-initialize for parity with the Moore backend (avoids
    // indeterminate members if CrossEntropyInfo is a plain aggregate).
    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.vocab_size = x_desc->shape().back();
    info.outer_size = target_desc->numel();
    // Logits are treated as contiguous along the class dimension.
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
    auto internal = reinterpret_cast<device::nvidia::Handle *>(handle)->internal();
    *desc_ptr = new Descriptor(
        new Opaque{internal},
        info, 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Dispatches the crossEntropy kernel over the (target dtype, logits dtype)
// pair recorded in `info`. One thread block is launched per output row.
//
// Fix: the original inner dtype if/else chains had no trailing `else`, so an
// unsupported logits dtype fell through and the function returned
// INFINI_STATUS_SUCCESS without launching anything. It now reports
// INFINI_STATUS_BAD_TENSOR_DTYPE, matching the Moore backend.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
                            const CrossEntropyInfo &info, cudaStream_t stream) {
    dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);

#define LAUNCH_CE_(TDATA, TIDX)                               \
    crossEntropy<BLOCK_SIZE, TDATA, TIDX>                     \
        <<<grid, BLOCK_SIZE, 0, stream>>>(                    \
            (TDATA *)y, (const TDATA *)x, target,             \
            info.outer_size, info.vocab_size, info.x_stride); \
    return INFINI_STATUS_SUCCESS

#define DISPATCH_CE_(TIDX)                      \
    switch (info.dtype) {                       \
    case INFINI_DTYPE_F16:                      \
        LAUNCH_CE_(half, TIDX);                 \
    case INFINI_DTYPE_BF16:                     \
        LAUNCH_CE_(__nv_bfloat16, TIDX);        \
    case INFINI_DTYPE_F32:                      \
        LAUNCH_CE_(float, TIDX);                \
    default:                                    \
        return INFINI_STATUS_BAD_TENSOR_DTYPE;  \
    }

    switch (info.target_dtype) {
    case INFINI_DTYPE_I64:
        DISPATCH_CE_(int64_t)
    case INFINI_DTYPE_I32:
        DISPATCH_CE_(int32_t)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

#undef DISPATCH_CE_
#undef LAUNCH_CE_
}
// Runs cross-entropy on the given stream, choosing the largest supported
// block size (1024/512, falling back to 256). No scratch memory is used.
//
// Fix: workspace and workspace_size were silently ignored; validate the size
// like the Moore backend so callers get a consistent contract.
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *y,
                                     const void *x,
                                     const void *target,
                                     void *stream_) const {
    (void)workspace; // no scratch memory is needed by these kernels
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    cudaStream_t stream = (cudaStream_t)stream_;
    int max_threads = _opaque->internal->maxThreadsPerBlock();
    if (max_threads >= 1024) {
        CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream));
    } else if (max_threads >= 512) {
        CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream));
    } else {
        CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
    }
    return INFINI_STATUS_SUCCESS;
}
}
// namespace op::cross_entropy::nvidia
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __CROSS_ENTROPY_NVIDIA_H__
#define __CROSS_ENTROPY_NVIDIA_H__

#include "../cross_entropy.h"

// Declares op::cross_entropy::nvidia::Descriptor via the shared macro.
DESCRIPTOR(nvidia)

#endif
src/infiniop/ops/cross_entropy/operator.cc
deleted
100644 → 0
View file @
6ab911c3
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/cross_entropy.h"
#ifdef ENABLE_CPU_API
#include "cpu/cross_entropy_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/cross_entropy_nvidia.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/cross_entropy_moore.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/cross_entropy_metax.h"
#endif
// C-ABI entry point: creates a cross-entropy descriptor for whichever device
// backend the handle targets. Backends compiled out via ENABLE_*_API fall
// through to DEVICE_TYPE_NOT_SUPPORTED.
__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor(
    infiniopHandle_t handle,
    infiniopCrossEntropyDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {

#define CREATE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                       \
        return op::cross_entropy::NAMESPACE::Descriptor::create(                     \
            handle,                                                                  \
            reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc, x_desc, target_desc);

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// C-ABI entry point: reports the scratch-buffer size the descriptor's backend
// requires for infiniopCrossEntropy.
__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize(
    infiniopCrossEntropyDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                         \
    case CASE:                                                                                       \
        *size = reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// C-ABI entry point: executes cross-entropy with a previously created
// descriptor, forwarding workspace, tensors and stream to the backend.
__INFINI_C infiniStatus_t infiniopCrossEntropy(
    infiniopCrossEntropyDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *target,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                               \
    case CASE:                                                                   \
        return reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, x, target, stream);

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// C-ABI entry point: destroys a cross-entropy descriptor, dispatching to the
// backend that created it so the right Descriptor type is deleted.
__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor(
    infiniopCrossEntropyDescriptor_t desc) {

#define DESTROY(CASE, NAMESPACE)                                                 \
    case CASE:                                                                   \
        delete reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        DESTROY(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        DESTROY(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DESTROY
}
src/infiniop/ops/equal/cpu/equal_cpu.cc
deleted
100644 → 0
View file @
6ab911c3
#include <cstdint>
#include <type_traits>
#include "equal_cpu.h"
namespace
op
::
equal
::
cpu
{
// The elementwise base class owns all state; nothing extra to free here.
Descriptor::~Descriptor() = default;
// Validates dtypes/shapes for elementwise equal on CPU and builds the
// descriptor. Inputs must share one dtype; the output is always BOOL and all
// three tensors must have identical shapes.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();

    // Mixed-dtype comparison is not supported on this backend.
    if (compute_dtype != b_desc->dtype()) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Runs elementwise equal on CPU, dispatching on the input dtype captured at
// create() time. Output elements are bool.
//
// Fixes: marked the unused workspace parameters as such (the CPU elementwise
// path needs no scratch buffer) and removed an unreachable
// `return INFINI_STATUS_SUCCESS;` that followed the switch — every case and
// the default already return.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    (void)workspace;
    (void)workspace_size;

// One line per supported dtype: delegate to the templated elementwise engine.
#define EQ_CASE(DT, T) \
    case DT:           \
        return _device_info->calculate<EqualOp, bool, T, T>(_info, output, inputs, stream);

    switch (_dtype) {
        EQ_CASE(INFINI_DTYPE_F16, fp16_t)
        EQ_CASE(INFINI_DTYPE_F32, float)
        EQ_CASE(INFINI_DTYPE_F64, double)
        EQ_CASE(INFINI_DTYPE_BF16, bf16_t)
        EQ_CASE(INFINI_DTYPE_I32, int32_t)
        EQ_CASE(INFINI_DTYPE_I64, int64_t)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef EQ_CASE
}
}
// namespace op::equal::cpu
src/infiniop/ops/equal/cpu/equal_cpu.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_CPU_H__
#define __EQUAL_CPU_H__
#include <type_traits>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR
(
equal
,
cpu
)
namespace
op
::
equal
::
cpu
{
// Binary predicate for elementwise equality on the CPU backend.
// Returns a == b when both inputs share one type; inputs of different types
// are (deliberately) never equal.
struct EqualOp {
public:
    // Number of input tensors consumed per output element.
    static constexpr size_t num_inputs = 2;

    // Tout is accepted for interface parity with the other backends; the
    // result is always a bool here.
    // Fix: marked const so the functor can be invoked through const objects,
    // consistent with the CUDA backend's EqualOp.
    template <typename Tout, typename Tin0, typename Tin1>
    bool operator()(const Tin0 &a, const Tin1 &b) const {
        if constexpr (std::is_same_v<Tin0, Tin1>) {
            return a == b;
        } else {
            return false;
        }
    }
};
}
// namespace op::equal::cpu
#endif
src/infiniop/ops/equal/cuda/kernel.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_CUDA_H__
#define __EQUAL_CUDA_H__
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
#include <type_traits>
namespace
op
::
equal
::
cuda
{
// Elementwise equality functor shared by the CUDA-like backends.
// Inputs of different types never compare equal; `half` uses the intrinsic
// __heq; half2 is explicitly rejected at compile time.
struct EqualOp {
public:
    static constexpr size_t num_inputs = 2;

    template <typename Tout, typename Tin0, typename Tin1>
    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
        if constexpr (!std::is_same_v<Tin0, Tin1>) {
            return false;
        } else if constexpr (std::is_same_v<Tin0, half2>) {
            // A packed half2 cannot produce a single scalar result.
            static_assert(!std::is_same_v<Tin0, half2>,
                          "half2 is not supported for mixed output dtype");
        } else if constexpr (std::is_same_v<Tin0, half>) {
            return static_cast<Tout>(__heq(a, b));
        } else {
            return static_cast<Tout>(a == b);
        }
    }
};
}
// namespace op::equal::cuda
#endif
src/infiniop/ops/equal/metax/equal_metax.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_METAX_API_H__
#define __EQUAL_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares op::equal::metax::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(equal, metax)

#endif // __EQUAL_METAX_API_H__
src/infiniop/ops/equal/metax/equal_metax.maca
deleted
100644 → 0
View file @
6ab911c3
#include "equal_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::equal::metax {
// The elementwise base class owns all state; nothing extra to free here.
Descriptor::~Descriptor() = default;
// Validates dtypes/shapes for elementwise equal on METAX and builds the
// descriptor. The output must be BOOL; all three shapes must match.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();

    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Runs elementwise equal on METAX via the shared elementwise engine,
// dispatching on the input dtype captured at create() time.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

#define EQ_CASE(DT, T) \
    case DT:           \
        return _device_info->calculate<256, cuda::EqualOp, bool, T, T>(_info, workspace, output, inputs, stream);

    switch (_dtype) {
        EQ_CASE(INFINI_DTYPE_F16, half)
        EQ_CASE(INFINI_DTYPE_BF16, cuda_bfloat16)
        EQ_CASE(INFINI_DTYPE_F32, float)
        EQ_CASE(INFINI_DTYPE_I32, int32_t)
        EQ_CASE(INFINI_DTYPE_I64, int64_t)
        EQ_CASE(INFINI_DTYPE_F64, double)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef EQ_CASE
}
} // namespace op::equal::metax
src/infiniop/ops/equal/moore/equal_moore.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_MOORE_API_H__
#define __EQUAL_MOORE_API_H__

#include "../../../elementwise/moore/elementwise_moore_api.h"

// Declares op::equal::moore::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(equal, moore)

#endif // __EQUAL_MOORE_API_H__
src/infiniop/ops/equal/moore/equal_moore.mu
deleted
100644 → 0
View file @
6ab911c3
#include "equal_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "equal_moore_kernel.h"
namespace op::equal::moore {
namespace {
// True when the op can skip the generic strided path: the output and both
// inputs are contiguous and neither input is broadcast.
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    if (!info.isOutputContiguous()) {
        return false;
    }
    const bool *contig = info.getInputContiguous();
    const bool *bcast = info.getInputBroadcasted();
    return contig[0] && !bcast[0] && contig[1] && !bcast[1];
}
// Grid-stride kernel for the contiguous fast path: output[i] = (a[i] == b[i]).
template <typename Tout, typename Tin>
INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) {
    const auto op = op::equal::moore::EqualOp{};
    // Each thread advances by the total launched thread count, so any grid
    // size (launch_fast_path caps it at 65535 blocks) covers all elements.
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;
    for (; idx < numel; idx += stride) {
        output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
    }
}
// Launches the contiguous fast-path kernel. The grid is capped at 65535
// blocks; the kernel's grid-stride loop covers the remainder.
//
// Fix: launch failures (bad configuration, invalid stream) were silently
// swallowed — the function unconditionally reported success. Check the
// runtime's pending-error state after the launch, mirroring the NVIDIA
// fast path in equal_nvidia.cu. (MUSA mirrors the CUDA runtime API;
// verify musaGetLastError/musaSuccess against the MUSA headers.)
template <typename Tout, typename Tin>
infiniStatus_t launch_fast_path(size_t numel,
                                void *output,
                                const std::vector<const void *> &inputs,
                                void *stream) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS; // nothing to compare
    }
    constexpr int kBlockSize = 256;
    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
    if (grid > 65535) {
        grid = 65535;
    }
    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
    equal_contiguous_kernel<Tout, Tin><<<grid, kBlockSize, 0, musa_stream>>>(
        numel,
        reinterpret_cast<Tout *>(output),
        reinterpret_cast<const Tin *>(inputs[0]),
        reinterpret_cast<const Tin *>(inputs[1]));
    return musaGetLastError() == musaSuccess ? INFINI_STATUS_SUCCESS
                                             : INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace
// The elementwise base class owns all state; nothing extra to free here.
Descriptor::~Descriptor() = default;
// Validates dtypes/shapes for elementwise equal on Moore and builds the
// descriptor. The output must be BOOL; all three shapes must match.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();

    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Runs elementwise equal on Moore. Fully contiguous, non-broadcast tensors
// take a dedicated kernel that needs no workspace; everything else goes
// through the generic elementwise engine.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (can_use_contiguous_fast_path(_info)) {
        size_t numel = _info.getOutputSize();
#define EQ_FAST(DT, T) \
    case DT:           \
        return launch_fast_path<bool, T>(numel, output, inputs, stream);
        switch (_dtype) {
            EQ_FAST(INFINI_DTYPE_F16, half)
            EQ_FAST(INFINI_DTYPE_BF16, cuda_bfloat16)
            EQ_FAST(INFINI_DTYPE_F32, float)
            EQ_FAST(INFINI_DTYPE_I32, int32_t)
            EQ_FAST(INFINI_DTYPE_I64, int64_t)
            EQ_FAST(INFINI_DTYPE_F64, double)
        default:
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
#undef EQ_FAST
    }

    // Workspace is only needed on the generic path, so it is checked here,
    // after the fast path had its chance.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
#define EQ_GENERIC(DT, T) \
    case DT:              \
        return _device_info->calculate<256, moore::EqualOp, bool, T, T>(_info, workspace, output, inputs, stream);
    switch (_dtype) {
        EQ_GENERIC(INFINI_DTYPE_F16, half)
        EQ_GENERIC(INFINI_DTYPE_BF16, cuda_bfloat16)
        EQ_GENERIC(INFINI_DTYPE_F32, float)
        EQ_GENERIC(INFINI_DTYPE_I32, int32_t)
        EQ_GENERIC(INFINI_DTYPE_I64, int64_t)
        EQ_GENERIC(INFINI_DTYPE_F64, double)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef EQ_GENERIC
}
} // namespace op::equal::moore
src/infiniop/ops/equal/moore/equal_moore_kernel.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_MOORE_KERNEL_H__
#define __EQUAL_MOORE_KERNEL_H__
#include <type_traits>
namespace
op
::
equal
::
moore
{
// Elementwise equality functor for the Moore backend. Half-precision inputs
// are widened to float before comparing; inputs of different types never
// compare equal.
struct EqualOp {
public:
    static constexpr size_t num_inputs = 2;

    template <typename Tout, typename Tin0, typename Tin1>
    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
        if constexpr (!std::is_same_v<Tin0, Tin1>) {
            return false;
        } else if constexpr (std::is_same_v<Tin0, half>) {
            return __half2float(a) == __half2float(b);
        } else if constexpr (std::is_same_v<Tin0, cuda_bfloat16>) {
            return __bfloat162float(a) == __bfloat162float(b);
        } else {
            return a == b;
        }
    }
};
}
// namespace op::equal::moore
#endif // __EQUAL_MOORE_KERNEL_H__
src/infiniop/ops/equal/nvidia/equal_nvidia.cu
deleted
100644 → 0
View file @
6ab911c3
#include <algorithm>
#include <cstdint>
#include <type_traits>
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "equal_nvidia.cuh"
namespace
{
// Grid-stride kernel for the contiguous fast path: output[i] = (a[i] == b[i]).
template <typename Tout, typename Tin>
INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) {
    op::equal::cuda::EqualOp op{};
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = blockDim.x * gridDim.x;
    for (; idx < n; idx += stride) {
        output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
    }
}
// Launches the contiguous fast-path kernel and surfaces launch-time errors
// (invalid configuration/stream) as INFINI_STATUS_INTERNAL_ERROR. The grid is
// capped at 65535 blocks; the kernel's grid-stride loop covers the rest.
template <typename Tout, typename Tin>
infiniStatus_t launchFastEqualKernel(size_t numel,
                                     void *output,
                                     const std::vector<const void *> &inputs,
                                     void *stream) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS; // nothing to compare
    }
    constexpr int block = 256;
    int grid = std::min(static_cast<int>((numel + block - 1) / block), 65535);
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    FastEqualKernel<Tout, Tin><<<grid, block, 0, cuda_stream>>>(
        numel,
        reinterpret_cast<Tout *>(output),
        reinterpret_cast<const Tin *>(inputs[0]),
        reinterpret_cast<const Tin *>(inputs[1]));
    return cudaGetLastError() == cudaSuccess ? INFINI_STATUS_SUCCESS
                                             : INFINI_STATUS_INTERNAL_ERROR;
}
}
// namespace
namespace
op
::
equal
::
nvidia
{
// The elementwise base class owns all state; nothing extra to free here.
Descriptor::~Descriptor() = default;
// Validates dtypes/shapes for elementwise equal on NVIDIA and builds the
// descriptor. Unlike the other backends, byte-sized integer outputs (U8/I8)
// are accepted in addition to BOOL; all three shapes must match.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();

    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Runs elementwise equal on NVIDIA. Fully contiguous, non-broadcast tensors
// take the dedicated fast-path kernel (no workspace needed); everything else
// goes through the generic elementwise engine.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    bool fast_path = _info.isOutputContiguous();
    if (fast_path) {
        const bool *contig = _info.getInputContiguous();
        const bool *bcast = _info.getInputBroadcasted();
        for (size_t i = 0; i < 2; ++i) {
            fast_path &= contig[i] && !bcast[i];
        }
    }

    if (fast_path) {
        size_t numel = _info.getOutputSize();
#define EQ_FAST(DT, T) \
    case DT:           \
        return launchFastEqualKernel<bool, T>(numel, output, inputs, stream);
        switch (_dtype) {
            EQ_FAST(INFINI_DTYPE_F16, half)
            EQ_FAST(INFINI_DTYPE_BF16, cuda_bfloat16)
            EQ_FAST(INFINI_DTYPE_F32, float)
            EQ_FAST(INFINI_DTYPE_I32, int32_t)
            EQ_FAST(INFINI_DTYPE_I64, int64_t)
            EQ_FAST(INFINI_DTYPE_F64, double)
        default:
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
#undef EQ_FAST
    }

    // Workspace is only needed on the generic path.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
#define EQ_GENERIC(DT, T) \
    case DT:              \
        return _device_info->calculate<256, cuda::EqualOp, bool, T, T>(_info, workspace, output, inputs, stream);
    switch (_dtype) {
        EQ_GENERIC(INFINI_DTYPE_F16, half)
        EQ_GENERIC(INFINI_DTYPE_BF16, cuda_bfloat16)
        EQ_GENERIC(INFINI_DTYPE_F32, float)
        EQ_GENERIC(INFINI_DTYPE_I32, int32_t)
        EQ_GENERIC(INFINI_DTYPE_I64, int64_t)
        EQ_GENERIC(INFINI_DTYPE_F64, double)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef EQ_GENERIC
}
}
// namespace op::equal::nvidia
src/infiniop/ops/equal/nvidia/equal_nvidia.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_CUDA_API_H__
#define __EQUAL_CUDA_API_H__

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

// Declares op::equal::nvidia::Descriptor (create/calculate/workspaceSize)
// via the shared elementwise descriptor macro; the implementation lives in
// equal_nvidia.cu.
ELEMENTWISE_DESCRIPTOR(equal, nvidia)

#endif
src/infiniop/ops/equal/operator.cc
deleted
100644 → 0
View file @
6ab911c3
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/equal.h"
#ifdef ENABLE_CPU_API
#include "cpu/equal_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/equal_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/equal_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/equal_kunlun.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/equal_bang.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/equal_moore.h"
#endif
// Creates an Equal descriptor for the backend selected by `handle->device`.
// a_desc/b_desc describe the two inputs compared elementwise; c_desc describes
// the output. Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when the handle's
// device was not compiled in (ENABLE_*_API) or is unknown.
__INFINI_C infiniStatus_t infiniopCreateEqualDescriptor(
    infiniopHandle_t handle,
    infiniopEqualDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {

// Expands to one switch case per backend, forwarding to that backend's
// Descriptor::create. Each case is compiled in only under its ENABLE_*_API flag.
#define CREATE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        return op::equal::NAMESPACE::Descriptor::create(                     \
            handle,                                                          \
            reinterpret_cast<op::equal::NAMESPACE::Descriptor **>(desc_ptr), \
            c_desc,                                                          \
            {a_desc, b_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar and QY reuse the nvidia implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Queries the workspace size (in bytes) required by infiniopEqual for the
// given descriptor, dispatching on the device type baked into the descriptor.
// On success writes *size and returns INFINI_STATUS_SUCCESS; unknown or
// not-compiled-in device types yield INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED.
__INFINI_C infiniStatus_t infiniopGetEqualWorkspaceSize(
    infiniopEqualDescriptor_t desc,
    size_t *size) {

// One switch case per backend: fetch the backend descriptor's workspace size.
#define GET(CASE, NAMESPACE)                                                                 \
    case CASE:                                                                               \
        *size = reinterpret_cast<op::equal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar and QY reuse the nvidia implementation.
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
    // NOTE: the trailing duplicate `return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;`
    // was removed — every switch path (including `default`) already returns, so it
    // was unreachable, and the sibling dispatchers in this file have no such tail.
}
// Executes elementwise equality: compares tensors a and b and writes results
// to c, using the workspace sized by infiniopGetEqualWorkspaceSize. Dispatches
// to the backend implementation selected when the descriptor was created.
__INFINI_C infiniStatus_t infiniopEqual(
    infiniopEqualDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    const void *b,
    void *stream) {

// One switch case per backend, forwarding to that backend's
// Descriptor::calculate with inputs ordered {a, b}.
#define CALCULATE(CASE, NAMESPACE)                                     \
    case CASE:                                                         \
        return reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, c, {a, b}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar and QY reuse the nvidia implementation.
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// Destroys an Equal descriptor, dispatching the delete to the backend type it
// was created as so the correct destructor runs.
__INFINI_C infiniStatus_t infiniopDestroyEqualDescriptor(
    infiniopEqualDescriptor_t desc) {

// One switch case per backend: delete through the backend's Descriptor type.
#define DELETE(CASE, NAMESPACE)                                                 \
    case CASE:                                                                  \
        delete reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar and QY reuse the nvidia implementation.
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
deleted
100644 → 0
View file @
6ab911c3
#include "hardswish_cpu.h"
#include <cstddef>
namespace op::hardswish::cpu {

namespace {

// True when the single input and the output are both contiguous and the input
// is not broadcast, so the op can run as a flat loop over raw pointers instead
// of going through the generic strided elementwise machinery.
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    return info.isOutputContiguous()
        && info.getInputSize() == 1
        && info.getInputContiguous()[0]
        && !info.getInputBroadcasted()[0];
}

// Contiguous fast path: applies HardSwishOp element-by-element over flat
// buffers, parallelized with OpenMP for large sizes.
template <typename T>
infiniStatus_t launch_contiguous_cpu(
    const op::elementwise::ElementwiseInfo &info,
    void *output,
    const std::vector<const void *> &inputs) {
    const T *in = reinterpret_cast<const T *>(inputs[0]);
    T *out = reinterpret_cast<T *>(output);
    // ptrdiff_t loop index: OpenMP requires a signed induction variable.
    const ptrdiff_t size = static_cast<ptrdiff_t>(info.getOutputSize());
#pragma omp parallel for if (size > 1024)
    for (ptrdiff_t i = 0; i < size; ++i) {
        out[i] = HardSwishOp{}(in[i]);
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace

Descriptor::~Descriptor() = default;

// Validates dtype (bf16/f16/f32/f64) and that input/output shapes match, then
// builds the generic CPU elementwise descriptor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}

// Runs hardswish on the CPU. Takes the contiguous fast path when possible,
// otherwise falls back to the generic strided elementwise engine.
// workspace/workspace_size/stream are unused on this backend.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    const bool fast_path = can_use_contiguous_fast_path(_info);
    if (fast_path) {
        switch (_dtype) {
        case INFINI_DTYPE_BF16:
            return launch_contiguous_cpu<bf16_t>(_info, output, inputs);
        case INFINI_DTYPE_F16:
            return launch_contiguous_cpu<fp16_t>(_info, output, inputs);
        case INFINI_DTYPE_F32:
            return launch_contiguous_cpu<float>(_info, output, inputs);
        case INFINI_DTYPE_F64:
            return launch_contiguous_cpu<double>(_info, output, inputs);
        default:
            // Unrecognized dtype on the fast path: fall through to the generic
            // dispatch below, which reports BAD_TENSOR_DTYPE.
            break;
        }
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<HardSwishOp, bf16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<HardSwishOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<HardSwishOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<HardSwishOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // NOTE: the original trailing `return INFINI_STATUS_SUCCESS;` was removed —
    // every switch path (including `default`) returns, so it was unreachable.
}

} // namespace op::hardswish::cpu
src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDSWISH_CPU_H__
#define __HARDSWISH_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"

// Declares op::hardswish::cpu::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(hardswish, cpu)

#include <algorithm>
#include <cmath>

namespace op::hardswish::cpu {

// hardswish(x) = x * clamp(x + 3, 0, 6) / 6.
// Plain `struct` instead of the original C-style `typedef struct X {...} X;`
// — redundant in C++; the type name is unchanged for all users.
struct HardSwishOp {
public:
    static constexpr size_t num_inputs = 1;

    // Computes in float regardless of T, using utils::cast for the
    // low-precision conversions, then casts back to T.
    template <typename T>
    T operator()(const T &x) const {
        const float x_f = utils::cast<float>(x);
        const float clamped = std::min(std::max(x_f + 3.0f, 0.0f), 6.0f);
        const float result = x_f * clamped * (1.0f / 6.0f);
        return utils::cast<T>(result);
    }
};

// Variant that computes natively in T (no float round-trip); requires T to
// support +, *, and std::min/std::max.
// NOTE(review): not referenced anywhere in the code visible in this file —
// confirm external callers before relying on or removing it.
struct HardSwishContiguousOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    T operator()(const T &x) const {
        T three = static_cast<T>(3);
        T zero = static_cast<T>(0);
        T six = static_cast<T>(6);
        T scale = static_cast<T>(0.16666667f);
        T val = x + three;
        val = std::max(zero, val);
        val = std::min(six, val);
        return x * val * scale;
    }
};

} // namespace op::hardswish::cpu

#endif
src/infiniop/ops/hardswish/cuda/kernel.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDSWISH_CUDA_H__
#define __HARDSWISH_CUDA_H__

#include <cmath>

// MACA (Metax) toolchain ships its own fp16/bf16 headers; otherwise use CUDA's.
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif

namespace op::hardswish::cuda {

// Device-side functor: hardswish(x) = x * clamp(x + 3, 0, 6) / 6,
// specialized per element type via `if constexpr`.
typedef struct HardSwishOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed two-lane half path; 0.16666667f ~= 1/6.
            const half2 three = __float2half2_rn(3.0f);
            const half2 scale = __float2half2_rn(0.16666667f);
            half2 val = __hadd2(x, three);
#if defined(ENABLE_ILUVATAR_API)
            // Iluvatar: clamp per lane in float and repack.
            float2 val_f = __half22float2(val);
            val_f.x = fminf(fmaxf(val_f.x, 0.0f), 6.0f);
            val_f.y = fminf(fmaxf(val_f.y, 0.0f), 6.0f);
            val = __floats2half2_rn(val_f.x, val_f.y);
#else
            const half2 zero = __float2half2_rn(0.0f);
            const half2 six = __float2half2_rn(6.0f);
#if __CUDA_ARCH__ >= 800
            val = __hmin2(__hmax2(val, zero), six);
#else
            // NOTE(review): both arch branches use __hmax2/__hmin2, only split
            // into two statements here — presumably a codegen workaround for
            // pre-sm_80 targets; confirm against the toolchain docs.
            val = __hmax2(val, zero);
            val = __hmin2(val, six);
#endif
#endif
            return __hmul2(__hmul2(x, val), scale);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // bf16: compute in float, convert back.
            const float x_f = __bfloat162float(x);
            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
            return __float2bfloat16(x_f * val * 0.16666667f);
        } else if constexpr (std::is_same_v<T, half>) {
            // Scalar half: compute in float, convert back.
            const float x_f = __half2float(x);
            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
            return __float2half(x_f * val * 0.16666667f);
        } else if constexpr (std::is_same_v<T, float>) {
            const float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
            return x * val * 0.16666667f;
        } else if constexpr (std::is_same_v<T, double>) {
            const double val = fmin(fmax(x + 3.0, 0.0), 6.0);
            return x * val * (1.0 / 6.0);
        }
        // NOTE(review): no final `else` — instantiating with any other T falls
        // off the end without a return; relies on callers only using the types
        // above. Consider a static_assert in a future change.
    }
} HardSwishOp;

} // namespace op::hardswish::cuda

#endif
src/infiniop/ops/hardswish/metax/hardswish_metax.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDSWISH_METAX_API_H__
#define __HARDSWISH_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares op::hardswish::metax::Descriptor via the shared elementwise macro;
// the implementation lives in hardswish_metax.maca.
ELEMENTWISE_DESCRIPTOR(hardswish, metax)

#endif // __HARDSWISH_METAX_API_H__
src/infiniop/ops/hardswish/metax/hardswish_metax.maca
deleted
100644 → 0
View file @
6ab911c3
#include "hardswish_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::hardswish::metax {

Descriptor::~Descriptor() = default;

// Validates dtype (bf16/f16/f32/f64) and that input/output shapes match, then
// builds the generic Metax elementwise descriptor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}

// Dispatches the device-side cuda::HardSwishOp kernel (256-thread blocks)
// by dtype after checking the caller-provided workspace is large enough.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::hardswish::metax
Prev
1
2
3
4
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment