Unverified commit 93191613 authored by thatPepe, committed by GitHub

Merge pull request #1075 from InfiniTensor/RevertT_1-1-4

Revert T1-1-4
parents 6ab911c3 def22a08
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "var_mean_moore.h"
namespace op::var_mean::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t var_output_desc,
infiniopTensorDescriptor_t mean_output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool unbiased,
bool keepdim) {
auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
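// Workspace layout, consumed in the same order by launchKernel:
// [ size_t permuted_input_shape[ndim] | ptrdiff_t permuted_input_strides[ndim] ]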
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
// The result is NaN when there is nothing to reduce (reduce_num == 0) or when
// an unbiased variance divides by n - 1 == 0 (reduce_num == 1).
bool IsNanOut(const VarMeanInfo &info) {
return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var);
}
template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
infiniStatus_t launchKernel(
const VarMeanInfo &info,
Tdata *var_output, Tdata *mean_output, const Tdata *input,
bool unbiased, bool keepdim,
musaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
workspace_offset += input_ndim * sizeof(size_t);
ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
workspace_offset += input_ndim * sizeof(ptrdiff_t);
CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
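// Both copies are enqueued on the same stream as the kernel launches below,
// so stream ordering guarantees the shape/stride tables reach the device
// before any kernel dereferences them.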
bool is_nan = IsNanOut(info);
if (info.reduce_num == input_size) { // scalar output
ComputeType *tmp_buffer;
constexpr size_t MAX_GRID_SIZE = 128;
size_t grid_size = std::min(MAX_GRID_SIZE,
(input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
grid_size = std::max(1UL, grid_size);
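// grid_size * 3 elements: presumably one (count, mean, M2) Welford partial
// per block, reduced across blocks inside ComputeVarScalarOut.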
CHECK_MOORE(musaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
input, var_output, mean_output, tmp_buffer, input_size, input_ndim,
permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
CHECK_MOORE(musaFree(tmp_buffer));
} else {
size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
grid_size = std::max(1UL, grid_size);
ComputeVarMeanUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
input, var_output, mean_output, input_ndim, output_size, reduce_num,
permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *var_output,
void *mean_output,
const void *input,
bool unbiased,
bool keepdim,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType) \
launchKernel<BLOCK_SIZE, Tdata, ComputeType>( \
_info, \
(Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \
unbiased, keepdim, \
stream, workspace, workspace_size)
#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, __mt_bfloat16, double); \
else if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::var_mean::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "var_mean_nvidia.cuh"
namespace op::var_mean::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t var_output_desc,
infiniopTensorDescriptor_t mean_output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool unbiased,
bool keepdim) {
auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
// The result is NaN when there is nothing to reduce (reduce_num == 0) or when
// an unbiased variance divides by n - 1 == 0 (reduce_num == 1).
bool IsNanOut(const VarMeanInfo &info) {
return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var);
}
template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
infiniStatus_t launchKernel(
const VarMeanInfo &info,
Tdata *var_output, Tdata *mean_output, const Tdata *input,
bool unbiased, bool keepdim,
cudaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
workspace_offset += input_ndim * sizeof(size_t);
ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
workspace_offset += input_ndim * sizeof(ptrdiff_t);
CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
bool is_nan = IsNanOut(info);
if (info.reduce_num == input_size) { // scalar output
ComputeType *tmp_buffer;
constexpr size_t MAX_GRID_SIZE = 128;
size_t grid_size = std::min(MAX_GRID_SIZE,
(input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
grid_size = std::max(1UL, grid_size);
CHECK_CUDA(cudaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
input, var_output, mean_output, tmp_buffer, input_size, input_ndim,
permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan);
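// cudaFree synchronizes with outstanding device work in practice, so freeing
// right after the asynchronous launch is safe but stalls the pipeline; a
// stream-ordered cudaMallocAsync/cudaFreeAsync pair (CUDA 11.2+) or carving
// tmp_buffer out of the caller-provided workspace would avoid the sync.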
CHECK_CUDA(cudaFree(tmp_buffer));
} else {
size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
grid_size = std::max(1UL, grid_size);
ComputeVarMeanUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
input, var_output, mean_output, input_ndim, output_size, reduce_num,
permuted_input_shape_cuda, permuted_input_strides_cuda, unbiased, is_nan);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *var_output,
void *mean_output,
const void *input,
bool unbiased,
bool keepdim,
void *stream_) const {
cudaStream_t stream = (cudaStream_t)stream_;
#define CALCULATE_VAR_MEAN(BLOCK_SIZE, Tdata, ComputeType) \
launchKernel<BLOCK_SIZE, Tdata, ComputeType>( \
_info, \
(Tdata *)var_output, (Tdata *)mean_output, (const Tdata *)input, \
unbiased, keepdim, \
stream, workspace, workspace_size)
#define CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BF16) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, __nv_bfloat16, double); \
else if (_info.dtype == INFINI_DTYPE_F16) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, half, double); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return CALCULATE_VAR_MEAN(BLOCK_SIZE, float, double); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_VAR_MEAN_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::var_mean::nvidia
#ifndef __VAR_MEAN_NVIDIA_H__
#define __VAR_MEAN_NVIDIA_H__
#include "../var_mean_desc.h"
DESCRIPTOR(nvidia);
#endif // __VAR_MEAN_NVIDIA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/var_mean.h"
#include <vector>
#ifdef ENABLE_CPU_API
#include "cpu/var_mean_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/var_mean_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/var_mean_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/var_mean_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/var_mean_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateVarMeanDescriptor(
infiniopHandle_t handle,
infiniopVarMeanDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t var_output_desc,
infiniopTensorDescriptor_t mean_output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool unbiased,
bool keepdim) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::var_mean::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::var_mean::NAMESPACE::Descriptor **>(desc_ptr), \
var_output_desc, \
mean_output_desc, \
input_desc, \
dim, \
dim_size, \
unbiased, \
keepdim)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetVarMeanWorkspaceSize(infiniopVarMeanDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::var_mean::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopVarMean(
infiniopVarMeanDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *var_output,
void *mean_output,
const void *input,
size_t *dim,
size_t dim_size,
bool unbiased,
bool keepdim,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::var_mean::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, var_output, mean_output, input, unbiased, keepdim, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t
infiniopDestroyVarMeanDescriptor(infiniopVarMeanDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::var_mean::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
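For orientation, the four entry points above are designed to be called in a create / query-workspace / run / destroy sequence. Below is a minimal, hypothetical sketch; the handle and tensor-descriptor setup, the device buffers, the device_alloc helper, and the CHECK macro are illustrative assumptions and are not part of this diff.

// Hypothetical usage sketch (not part of this change).
infiniopHandle_t handle;                                  // assumed: created elsewhere
infiniopTensorDescriptor_t var_desc, mean_desc, in_desc;  // assumed: set up elsewhere
void *var_out, *mean_out;                                 // assumed: device buffers
const void *input;                                        // assumed: device buffer
size_t dims[] = {1};                                      // reduce along dim 1

infiniopVarMeanDescriptor_t desc;
CHECK(infiniopCreateVarMeanDescriptor(handle, &desc, var_desc, mean_desc,
                                      in_desc, dims, 1,
                                      /*unbiased=*/true, /*keepdim=*/false));

size_t ws_size = 0;
CHECK(infiniopGetVarMeanWorkspaceSize(desc, &ws_size));
void *ws = device_alloc(ws_size);                         // assumed allocator

CHECK(infiniopVarMean(desc, ws, ws_size, var_out, mean_out, input,
                      dims, 1, /*unbiased=*/true, /*keepdim=*/false,
                      /*stream=*/nullptr));
CHECK(infiniopDestroyVarMeanDescriptor(desc));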
#ifndef INFINIOP_VAR_MEAN_DESCRIPTOR_H_
#define INFINIOP_VAR_MEAN_DESCRIPTOR_H_
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::var_mean::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
VarMeanInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
VarMeanInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t var_output_desc, \
infiniopTensorDescriptor_t mean_output_desc, \
infiniopTensorDescriptor_t input_desc, \
size_t *dim, \
size_t dim_size, \
bool unbiased, \
bool keepdim); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *var_output, \
void *mean_output, \
const void *input, \
bool unbiased, \
bool keepdim, \
void *stream) const; \
}; \
}
#endif // INFINIOP_VAR_MEAN_DESCRIPTOR_H_
@@ -13,22 +13,6 @@ struct CustomBFloat16 {
};
typedef struct CustomBFloat16 bf16_t;
inline bool operator==(const CustomFloat16 &lhs, const CustomFloat16 &rhs) {
return lhs._v == rhs._v;
}
inline bool operator!=(const CustomFloat16 &lhs, const CustomFloat16 &rhs) {
return !(lhs == rhs);
}
inline bool operator==(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) {
return lhs._v == rhs._v;
}
inline bool operator!=(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) {
return !(lhs == rhs);
}
float _f16_to_f32(fp16_t val);
fp16_t _f32_to_f16(float val);
......
@@ -56,7 +56,7 @@ def parse_test_cases():
for data in _TEST_CASES_DATA:
shape, strides, dim, keepdim, out_strides = data
input_supports_inplace = not is_broadcast(strides)
# out_supports_inplace = not is_broadcast(out_strides)
out_supports_inplace = not is_broadcast(out_strides)
for dtype in _TENSOR_DTYPES:
tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})
@@ -81,19 +81,19 @@ def parse_test_cases():
)
# explicit out when supported (create out tensor with computed shape)
# out_shape = _compute_out_shape(shape, dim, keepdim)
# out_spec = TensorSpec.from_tensor(out_shape, out_strides, infinicore.bool)
# if out_supports_inplace:
# test_cases.append(
# TestCase(
# inputs=[in_spec],
# kwargs=kwargs,
# output_spec=out_spec,
# comparison_target="out",
# tolerance=tol,
# description="All - INPLACE(out)",
# )
# )
out_shape = _compute_out_shape(shape, dim, keepdim)
out_spec = TensorSpec.from_tensor(out_shape, out_strides, infinicore.bool)
if out_supports_inplace:
test_cases.append(
TestCase(
inputs=[in_spec],
kwargs=kwargs,
output_spec=out_spec,
comparison_target="out",
tolerance=tol,
description="All - INPLACE(out)",
)
)
return test_cases
@@ -110,9 +110,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.all(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.all(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.all(*args, **kwargs)
def main():
......
@@ -74,8 +74,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.avg_pool1d(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.nn.functional.avg_pool1d(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.avg_pool1d(*args, **kwargs)
def main():
......
@@ -11,8 +11,6 @@ from framework.tensor import TensorInitializer
# Test cases format: (input_shape_logits_N_C, target_shape_N, input_strides_or_None, weight_present_bool, ignore_index_or_None)
# infinicore.nn.functional.cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean')
# The CrossEntropy kernel currently only supports element-wise loss, without
# class weight / ignore_index. The original configurations are kept; once
# these features are implemented, only the filter conditions need relaxing.
_TEST_CASES_DATA = [
((4, 5), (4,), None, False, None),
((8, 10), (8,), None, True, -1),
@@ -22,9 +20,6 @@ _TEST_CASES_DATA = [
((2, 2), (2,), None, True, -100),
]
_SUPPORT_WEIGHT = False
_SUPPORT_IGNORE_INDEX = False
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
@@ -45,11 +40,6 @@ def parse_test_cases():
) in _TEST_CASES_DATA:
for dtype in _TENSOR_DTYPES:
tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
if weight_present and not _SUPPORT_WEIGHT:
continue
if ignore_index is not None and not _SUPPORT_IGNORE_INDEX:
continue
logits = TensorSpec.from_tensor(logits_shape, logits_strides, dtype)
target = TensorSpec.from_tensor(
target_shape,
@@ -61,7 +51,7 @@
)
inputs = [logits, target]
kwargs = {"reduction": "none"}
kwargs = {}
if weight_present:
weight_spec = TensorSpec.from_tensor((logits_shape[1],), None, dtype)
inputs.append(weight_spec)
@@ -94,10 +84,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.cross_entropy(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation."""
out = kwargs.pop("out", None)
return infinicore.cross_entropy(*args, out=out, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.cross_entropy(*args, **kwargs)
def main():
......
@@ -74,11 +74,8 @@ def parse_test_cases():
)
)
# Equal produces a bool result, so a float/int input buffer cannot safely be
# reused as the output buffer. In-place is only allowed when the input dtype
# itself is bool; the switch below is reserved for that case.
allow_input_inplace = dtype == infinicore.bool
if allow_input_inplace and a_supports_inplace:
# in-place a
if a_supports_inplace:
test_cases.append(
TestCase(
inputs=[a_spec, b_spec],
@@ -90,7 +87,8 @@
)
)
if allow_input_inplace and b_supports_inplace:
# in-place b
if b_supports_inplace:
test_cases.append(
TestCase(
inputs=[a_spec, b_spec],
@@ -117,8 +115,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.eq(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.equal(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.eq(*args, **kwargs)
def main():
......
@@ -70,8 +70,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.hardswish(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.nn.functional.hardswish(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.hardswish(*args, **kwargs)
def main():
......
@@ -17,6 +17,7 @@ from framework import (
_TEST_CASES_DATA = [
((13, 4), None, -1.0, 1.0),
((13, 4), (10, 1), -0.5, 0.5),
((8, 8, 8), None, -2.0, 2.0),
]
@@ -86,11 +87,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.hardtanh(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation."""
import infinicore.nn.functional as F
return F.hardtanh(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.hardtanh(*args, **kwargs)
def main():
......
@@ -20,7 +20,7 @@ _TEST_CASES_DATA = [
((8, 8), None, None, None, None),
((8, 8), (16, 1), 1, False, None),
((2, 3, 4), None, 0, True, None),
((1, 8), None, (0,), False, None), # a tuple dim yields an infini_list in kwargs: dim [0]
((1, 8), None, (0,), False, None),
((16, 64), (128, 1), None, None, None),
((4, 5, 6), (60, 12, 2), 2, True, None),
]
@@ -61,6 +61,7 @@ def parse_test_cases():
description="Sum - OUT_OF_PLACE",
)
)
return test_cases
@@ -76,11 +77,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.sum(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.sum(
*args, **kwargs
) # TODO: locate the corresponding python/infinicore/ops/sum.py
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.sum(*args, **kwargs)
def main():
......
@@ -15,7 +15,7 @@ from framework import (
# Test cases format: (shape, input_strides, k, dim, largest, sorted)
_TEST_CASES_DATA = [
((6, 8), None, 1, 1, False, True),
((6, 8), None, 1, 1, True, True),
((8, 4), (16, 1), 2, 0, True, False),
((5, 5), None, 3, -1, False, True),
((3, 7), (14, 1), 2, 1, True, True),
@@ -55,7 +55,6 @@ def parse_test_cases():
comparison_target=None,
tolerance=tol,
description=f"topk - OUT_OF_PLACE",
output_count=2,
)
)
@@ -78,9 +77,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.topk(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.topk(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.topk(*args, **kwargs)
def main():
......
@@ -76,9 +76,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.var(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.var(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.var(*args, **kwargs)
def main():
......
@@ -15,7 +15,7 @@ from framework import (
# Test cases format: (in_shape, in_strides_or_None, dim_or_None, unbiased_or_None, keepdim_or_None)
# var_mean returns (var, mean)
# Changed in torch version 2.0: Previously this argument was called unbiased and was a boolean with True corresponding to correction=1 and False being correction=0.
_TEST_CASES_DATA = [
((8, 8), None, None, None, None),
((8, 8), (16, 1), 1, True, False),
@@ -27,7 +27,7 @@ _TEST_CASES_DATA = [
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-3},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
}
_TENSOR_DTYPES = [infinicore.float16, infinicore.float32]
......@@ -47,8 +47,6 @@ def parse_test_cases():
kwargs["dim"] = dim
if unbiased is not None:
kwargs["unbiased"] = unbiased
# Changed in version 2.0: Previously this argument was called unbiased and was a boolean with True
# corresponding to correction=1 and False being correction=0.
if keepdim is not None:
kwargs["keepdim"] = keepdim
@@ -78,9 +76,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.var_mean(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation (operator not yet available)."""
return infinicore.var_mean(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.var_mean(*args, **kwargs)
def main():
......
import ctypes
from ctypes import c_uint64
import torch
from libinfiniop import (
LIBINFINIOP,
InfiniDeviceNames,
InfiniDtype,
InfiniDtypeNames,
TestTensor,
TestWorkspace,
check_error,
debug,
get_args,
get_test_devices,
get_tolerance,
infiniopOperatorDescriptor_t,
profile_operation,
test_operator,
)
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
_TEST_CASES = [
# input_shape, x_stride, y_stride, kernel_size, stride, padding
((2, 3, 16), None, None, 3, None, 0),
((1, 4, 15), (60, 15, 1), (60, 15, 1), 5, 1, 2),
((2, 1, 32), None, (32, 16, 1), 2, 2, 0),
((3, 2, 7), (14, 7, 1), (9, 3, 1), 3, None, 1),
((4, 6, 31), None, None, 4, 2, 1),
((2, 8, 9), (72, 9, 1), (56, 7, 1), 3, 1, 0),
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def _effective_stride(stride, kernel_size):
if stride in (None, 0):
return kernel_size
return stride
def _compute_output_shape(input_shape, kernel_size, stride, padding):
stride = _effective_stride(stride, kernel_size)
width = input_shape[2]
out_width = (width + 2 * padding - kernel_size) // stride + 1
return (input_shape[0], input_shape[1], out_width)
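# Worked example from the first test case: input (2, 3, 16), kernel_size=3,
# stride=None -> effective stride 3, padding=0 gives
# out_width = (16 + 0 - 3) // 3 + 1 = 5, i.e. output shape (2, 3, 5).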
def avg_pool1d_ref(x, kernel_size, stride, padding):
stride = _effective_stride(stride, kernel_size)
out = torch.nn.functional.avg_pool1d(
x.to(torch.float32), kernel_size=kernel_size, stride=stride, padding=padding
)
return out.to(x.dtype)
def test(
handle,
device,
input_shape,
x_stride,
y_stride,
kernel_size,
stride,
padding,
dtype=InfiniDtype.F16,
sync=None,
):
stride_value = _effective_stride(stride, kernel_size)
out_shape = _compute_output_shape(
input_shape, kernel_size, stride_value, padding
)
print(
f"Testing AvgPool1d on {InfiniDeviceNames[device]} with input_shape:{input_shape}, "
f"output_shape:{out_shape}, kernel_size:{kernel_size}, stride:{stride_value}, "
f"padding:{padding}, dtype:{InfiniDtypeNames[dtype]}"
)
x = TestTensor(input_shape, x_stride, dtype, device)
y = TestTensor(out_shape, y_stride, dtype, device, mode="zeros")
ans = avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateAvgPool1dDescriptor(
handle,
ctypes.byref(descriptor),
y.descriptor,
x.descriptor,
kernel_size,
stride_value,
padding,
)
)
# Invalidate descriptors in tensors after creation to make sure kernels read from arguments
x.destroy_desc()
y.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetAvgPool1dWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_avg_pool1d():
check_error(
LIBINFINIOP.infiniopAvgPool1d(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
x.data(),
None,
)
)
lib_avg_pool1d()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
if PROFILE:
# fmt: off
profile_operation(
"PyTorch",
lambda: avg_pool1d_ref(x.torch_tensor(), kernel_size, stride_value, padding),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
profile_operation(
" lib",
lambda: lib_avg_pool1d(),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyAvgPool1dDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ------------------------------------------------------------
# Test case configuration
# ------------------------------------------------------------
_TEST_CASES_ = [
((2, 4, 10), None, None), # logits shape, x_stride, y_stride
((1, 128, 32000), None, None),
((4, 512, 1000), None, None),
]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 2e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
# ------------------------------------------------------------
# PyTorch reference implementation
# ------------------------------------------------------------
def cross_entropy_ref(logits, target):
vocab = logits.shape[-1]
logits_flat = logits.reshape(-1, vocab).float()
target_flat = target.reshape(-1).long()
loss = torch.nn.functional.cross_entropy(logits_flat, target_flat, reduction="none")
return loss.view(target.shape).to(logits.dtype)
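# Example: logits of shape (2, 4, 10) flatten to (8, 10); the per-token loss
# of shape (8,) is reshaped back to (2, 4) and cast to the logits dtype.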
def test(handle, device, shape, x_stride=None, y_stride=None, dtype=InfiniDtype.F16, sync=None):
logits_shape = shape
label_shape = shape[:-1]
vocab = shape[-1]
print(f"Testing CrossEntropy on {InfiniDeviceNames[device]} logits:{logits_shape} dtype:{InfiniDtypeNames[dtype]}")
x = TestTensor(logits_shape, x_stride, dtype, device)
target = TestTensor(label_shape, None, InfiniDtype.I64, device)
# Generate valid labels
tgt = target.torch_tensor()
tgt.copy_(torch.randint(0, vocab, label_shape, dtype=torch.int64, device=tgt.device))
target.actual_tensor().copy_(tgt)
reference = cross_entropy_ref(x.torch_tensor(), target.torch_tensor())
y = TestTensor(label_shape, y_stride, dtype, device)
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateCrossEntropyDescriptor(
handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, target.descriptor
)
)
for tensor in [x, y, target]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(LIBINFINIOP.infiniopGetCrossEntropyWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
workspace = TestWorkspace(workspace_size.value, x.device)
def run():
check_error(
LIBINFINIOP.infiniopCrossEntropy(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
x.data(),
target.data(),
None,
)
)
run()
if sync:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
assert torch.allclose(y.actual_tensor(), reference, atol=atol, rtol=rtol)
check_error(LIBINFINIOP.infiniopDestroyCrossEntropyDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES_, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
_TEST_CASES_ = [
# shape, a_stride, b_stride, c_stride
((13, 4), None, None, None),
((13, 4), (10, 1), (10, 1), (10, 1)),
((13, 4), (0, 1), None, None),
((13, 4, 4), None, None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
((16, 5632), None, None, None),
((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)),
((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)),
((4, 4, 5632), None, None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
]
# The Equal operator generally does not support in-place (float input vs bool output differ in element size)
class Inplace(Enum):
OUT_OF_PLACE = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Input data types under test
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64]
# Tolerance settings (bool comparisons require an exact match)
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 0, "rtol": 0},
InfiniDtype.F32: {"atol": 0, "rtol": 0},
InfiniDtype.BF16: {"atol": 0, "rtol": 0},
InfiniDtype.I32: {"atol": 0, "rtol": 0},
InfiniDtype.I64: {"atol": 0, "rtol": 0},
InfiniDtype.BOOL: {"atol": 0, "rtol": 0},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
# PyTorch reference implementation
def equal_func(c, a, b):
torch.eq(a, b, out=c)
def test(
handle,
device,
shape,
a_stride=None,
b_stride=None,
c_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
):
# Input tensors use the specified dtype (e.g. float16)
a = TestTensor(shape, a_stride, dtype, device)
b = TestTensor(shape, b_stride, dtype, device)
# [Key change] The output tensor is forced to bool.
# Note: if c_stride is counted in bytes, a bool element is typically 1 byte.
c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device)
if c.is_broadcast():
return
print(
f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
f"input_dtype:{InfiniDtypeNames[dtype]} output_dtype:BOOL"
)
# Run the PyTorch reference
equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor())
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
# [Key change] Call the Equal create function
check_error(
LIBINFINIOP.infiniopCreateEqualDescriptor(
handle,
ctypes.byref(descriptor),
c.descriptor, # Output (Bool)
a.descriptor, # Input A
b.descriptor, # Input B
)
)
# Invalidate descriptors
for tensor in [a, b, c]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetEqualWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, c.device)
def lib_equal():
check_error(
LIBINFINIOP.infiniopEqual(
descriptor,
workspace.data(),
workspace.size(),
c.data(),
a.data(),
b.data(),
None,
)
)
lib_equal()
# Use the bool tolerance (effectively exact equality)
atol, rtol = get_tolerance(_TOLERANCE_MAP, InfiniDtype.BOOL)
if DEBUG:
debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Verify the result
assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: equal_func(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# Reuse the same test case configuration, since HardSwish is also element-wise
_TEST_CASES_ = [
# shape, input_stride, output_stride
((13, 4), None, None),
((13, 4), (10, 1), (10, 1)),
((13, 4), (0, 1), None),
((13, 4, 4), None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1)),
((13, 4, 4), (4, 0, 1), None),
((16, 5632), None, None),
((16, 5632), (13312, 1), (13312, 1)),
((4, 4, 5632), None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def test(
handle,
device,
shape,
input_stride=None,
output_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=torch.float16,
sync=None,
):
input = TestTensor(shape, input_stride, dtype, device)
if inplace == Inplace.INPLACE:
if input_stride != output_stride:
return
output = input
else:
output = TestTensor(shape, output_stride, dtype, device, mode="ones")
if output.is_broadcast():
return
print(
f"Testing HardSwish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride}"
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
new_output = torch.nn.functional.hardswish(input.torch_tensor())
output.update_torch_tensor(new_output)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateHardSwishDescriptor(
handle,
ctypes.byref(descriptor),
output.descriptor,
input.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
for tensor in [input, output]:
tensor.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetHardSwishWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, output.device)
def lib_hardswish():
check_error(
LIBINFINIOP.infiniopHardSwish(
descriptor,
workspace.data(),
workspace.size(),
output.data(),
input.data(),
None,
)
)
lib_hardswish()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(
output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
)
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: torch.nn.functional.hardswish(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyHardSwishDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")