Commit 18773b69 authored by wooway777

Revert "Merge pull request #1069 from InfiniTensor/issue/1031_T1_1_15"

This reverts commit 21c6af2d, reversing
changes made to 99a802dd.
parent bfead271
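A merge revert of this shape is typically produced with git revert -m 1 21c6af2d, where -m 1 selects the first parent (99a802dd) as the mainline and re-applies the merge's changes in reverse.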
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/reciprocal.h"
#ifdef ENABLE_CPU_API
#include "cpu/reciprocal_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/reciprocal_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/reciprocal_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/reciprocal_kunlun.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/reciprocal_bang.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/reciprocal_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateReciprocalDescriptor(
infiniopHandle_t handle,
infiniopReciprocalDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::reciprocal::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
{x_desc})
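// For reference, CREATE(INFINI_DEVICE_CPU, cpu) below expands to:
//   case INFINI_DEVICE_CPU:
//       return op::reciprocal::cpu::Descriptor::create(
//           handle,
//           reinterpret_cast<op::reciprocal::cpu::Descriptor **>(desc_ptr),
//           y_desc,
//           {x_desc});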
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopReciprocal(
infiniopReciprocalDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::reciprocal::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, {x}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t
infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::reciprocal::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
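The four entry points above form the standard infiniop descriptor lifecycle: create, query workspace size, calculate, destroy. Below is a minimal ctypes sketch of that call sequence; it is an illustration only, assuming the shared library is built as libinfiniop.so, that INFINI_STATUS_SUCCESS equals 0, and that the handle, tensor descriptors, and device pointers are obtained as in the tests further down.

import ctypes
from ctypes import byref, c_size_t, c_void_p

INFINI_STATUS_SUCCESS = 0  # assumed numeric value of the success status

lib = ctypes.CDLL("libinfiniop.so")  # hypothetical library path

def run_reciprocal(handle, y_desc, x_desc, y_ptr, x_ptr, stream=None):
    # 1. Create the operator descriptor from the tensor descriptors.
    desc = c_void_p()
    assert lib.infiniopCreateReciprocalDescriptor(
        handle, byref(desc), y_desc, x_desc) == INFINI_STATUS_SUCCESS
    # 2. Query the workspace size and allocate a buffer only if one is needed.
    size = c_size_t(0)
    assert lib.infiniopGetReciprocalWorkspaceSize(
        desc, byref(size)) == INFINI_STATUS_SUCCESS
    workspace = ctypes.create_string_buffer(size.value) if size.value else None
    # 3. Launch the computation (stream=None selects the default stream).
    assert lib.infiniopReciprocal(
        desc, workspace, size.value, y_ptr, x_ptr, stream) == INFINI_STATUS_SUCCESS
    # 4. Destroy the descriptor once the computation is done.
    assert lib.infiniopDestroyReciprocalDescriptor(desc) == INFINI_STATUS_SUCCESS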
@@ -119,9 +119,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.addcmul(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore implementation (operator not yet available)."""
-return infinicore.addcmul(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.addcmul(*args, **kwargs)
 def main():
......
@@ -97,9 +97,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.atanh(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore atanh implementation"""
-return infinicore.atanh(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.atanh(*args, **kwargs)
 def main():
......
@@ -80,9 +80,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore implementation (operator not yet available)."""
-return infinicore.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
 def main():
......
@@ -63,9 +63,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.cdist(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore implementation (operator not yet available)."""
-return infinicore.cdist(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.cdist(*args, **kwargs)
 def main():
......
@@ -89,9 +89,9 @@ class OpTest(BaseOperatorTest):
 def torch_operator(self, *args, **kwargs):
 return torch.reciprocal(*args, **kwargs)
-def infinicore_operator(self, *args, **kwargs):
-"""InfiniCore implementation (operator not yet available)."""
-return infinicore.reciprocal(*args, **kwargs)
+# def infinicore_operator(self, *args, **kwargs):
+# """InfiniCore implementation (operator not yet available)."""
+# return infinicore.reciprocal(*args, **kwargs)
 def main():
......
import torch
import ctypes
from ctypes import c_uint64, c_float
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration
# ==============================================================================
_TEST_CASES_ = [
# shape, input_stride, t1_stride, t2_stride
((3, 3), None, None, None),
((32, 512), None, None, None),
((32, 512), (1024, 1), (1024, 1), (1024, 1)),
((16, 32, 64), None, None, None),
((8, 1, 1024), None, None, None), # potential broadcast-shape test case
]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_INPUT = auto()
_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_INPUT]
_VALUES = [1.0, 0.5, -2.0] # exercise different value coefficients
_TEST_CASES = [
test_case + (inplace_item, value)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
for value in _VALUES
]
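# For example, ((3, 3), None, None, None) expands into six cases such as
# ((3, 3), None, None, None, Inplace.OUT_OF_PLACE, 1.0); the fields map onto
# test()'s (shape, input_stride, t1_stride, t2_stride, inplace, value) parameters.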
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 100
def test(
handle,
device,
shape,
input_stride=None,
t1_stride=None,
t2_stride=None,
inplace=Inplace.OUT_OF_PLACE,
value=1.0,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing Addcmul on {InfiniDeviceNames[device]} with shape:{shape} value:{value} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
# Prepare the input tensors
input_tensor = TestTensor(shape, input_stride, dtype, device)
t1 = TestTensor(shape, t1_stride, dtype, device)
t2 = TestTensor(shape, t2_stride, dtype, device)
# Compute the reference answer with PyTorch:
# out = input + value * t1 * t2
ans = torch.addcmul(input_tensor.torch_tensor(), t1.torch_tensor(), t2.torch_tensor(), value=value)
if inplace == Inplace.INPLACE_INPUT:
out = input_tensor
else:
out = TestTensor(shape, None, dtype, device)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
# Note: per the earlier definition, the Create interface takes value
check_error(
LIBINFINIOP.infiniopCreateAddcmulDescriptor(
handle,
ctypes.byref(descriptor),
out.descriptor,
input_tensor.descriptor,
t1.descriptor,
t2.descriptor,
c_float(value)
)
)
# Destroy the temporary descriptors so the kernel cannot hold stale references
for t in [input_tensor, t1, t2, out]:
t.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetAddcmulWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, input_tensor.device)
def lib_addcmul():
check_error(
LIBINFINIOP.infiniopAddcmul(
descriptor,
workspace.data(),
workspace_size.value,
out.data(),
input_tensor.data(),
t1.data(),
t2.data(),
None,
)
)
lib_addcmul()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
if PROFILE:
profile_operation("PyTorch", lambda: torch.addcmul(input_tensor.torch_tensor(), t1.torch_tensor(), t2.torch_tensor(), value=value), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_addcmul(), device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyAddcmulDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mAddcmul tests passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration
# ==============================================================================
_TEST_CASES_ = [
# shape, a_stride, y_stride
((13, 4), None, None),
((13, 4), (10, 1), (10, 1)),
((13, 4, 4), None, None),
((13, 4, 4), (20, 4, 1), (20, 4, 1)),
((16, 5632), None, None),
((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE_A = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE_A,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# atanh typically supports floating point types
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def atanh_torch(y, a):
torch.atanh(a, out=y)
def test(
handle,
device,
shape,
a_stride=None,
y_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=InfiniDtype.F16,
sync=None,
):
# Initialize input tensor
a = TestTensor(shape, a_stride, dtype, device)
# Crucial: clamp values to (-1, 1) to avoid NaN/Inf for atanh
with torch.no_grad():
a.torch_tensor().clamp_(-0.99, 0.99)
# Keep underlying data in sync for all devices (including CPU)
a.actual_tensor().copy_(a.torch_tensor())
if inplace == Inplace.INPLACE_A:
if a_stride != y_stride:
return
y = a
else:
y = TestTensor(shape, y_stride, dtype, device, mode="ones")
if y.is_broadcast():
return
print(
f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} y_stride:{y_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
# Reference calculation
atanh_torch(y.torch_tensor(), a.torch_tensor())
if sync is not None:
sync()
# Create descriptor
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateAtanhDescriptor(
handle,
ctypes.byref(descriptor),
y.descriptor,
a.descriptor,
)
)
# Invalidate descriptors to ensure kernel uses its own internal state
for tensor in [a, y]:
tensor.destroy_desc()
# Workspace management
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetAtanhWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, y.device)
def lib_atanh():
check_error(
LIBINFINIOP.infiniopAtanh(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
a.data(),
None,
)
)
# Run library function
lib_atanh()
# Validate results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling
if PROFILE:
profile_operation("PyTorch", lambda: atanh_torch(y.torch_tensor(), a.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mAtanh Test passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64, c_float, c_char_p
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
# Configuration
# ==============================================================================
# Test scenarios: (shape, has_weight, has_pos_weight, reduction)
_TEST_CASES_DATA = [
((4, 5), False, False, "none"),
((8, 8), True, False, "sum"),
((32, 512), False, True, "mean"),
((16, 32, 64), True, True, "mean"),
]
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-2, "rtol": 5e-2},
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
}
_REDUCTIONS = ["none", "mean", "sum"]
_REDUCTION_MAP = {
"none": 0, # INFINIOP_REDUCTION_NONE
"mean": 1, # INFINIOP_REDUCTION_MEAN
"sum": 2, # INFINIOP_REDUCTION_SUM
}
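# For a (4, 5) input, reduction="none" keeps the (4, 5) output shape, while
# "mean" and "sum" reduce the loss to a scalar (shape ()), as out_shape below reflects.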
# Final test-case combinations
_TEST_CASES = _TEST_CASES_DATA
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 100
def test(
handle,
device,
shape,
has_weight=False,
has_pos_weight=False,
reduction="none",
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing BCEWithLogits on {InfiniDeviceNames[device]} shape:{shape} "
f"weight:{has_weight} pos_weight:{has_pos_weight} reduction:{reduction} dtype:{InfiniDtypeNames[dtype]}"
)
# 1. Prepare the input tensors
input_tensor = TestTensor(shape, None, dtype, device)
target = TestTensor(shape, None, dtype, device)
weight = TestTensor(shape, None, dtype, device) if has_weight else None
# pos_weight typically broadcasts along the last dimension, with shape (C,)
pos_weight_shape = (shape[-1],)
pos_weight = TestTensor(pos_weight_shape, None, dtype, device) if has_pos_weight else None
# 2. Compute the reference answer with PyTorch
torch_input = input_tensor.torch_tensor()
torch_target = target.torch_tensor()
torch_weight = weight.torch_tensor() if has_weight else None
torch_pos_weight = pos_weight.torch_tensor() if has_pos_weight else None
ans = torch.nn.functional.binary_cross_entropy_with_logits(
torch_input,
torch_target,
weight=torch_weight,
pos_weight=torch_pos_weight,
reduction=reduction
)
# 3. Prepare the output tensor (shape depends on reduction)
out_shape = () if reduction != "none" else shape
out = TestTensor(out_shape, None, dtype, device)
if sync is not None:
sync()
# 4. Create the descriptor and execute
descriptor = infiniopOperatorDescriptor_t()
# Emulate the C interface call
reduction_enum = _REDUCTION_MAP[reduction]
check_error(
LIBINFINIOP.infiniopCreateBCEWithLogitsDescriptor(
handle,
ctypes.byref(descriptor),
out.descriptor,
input_tensor.descriptor,
target.descriptor,
weight.descriptor if has_weight else None,
pos_weight.descriptor if has_pos_weight else None,
reduction_enum # reduction-mode enum value, corresponding to infiniopReduction_t
)
)
# Destroy the temporary descriptors
for t in [input_tensor, target, out]:
t.destroy_desc()
if weight is not None: weight.destroy_desc()
if pos_weight is not None: pos_weight.destroy_desc()
workspace_size = c_uint64(0)
check_error(LIBINFINIOP.infiniopGetBCEWithLogitsWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
workspace = TestWorkspace(workspace_size.value, device)
def lib_op():
check_error(
LIBINFINIOP.infiniopBCEWithLogits(
descriptor,
workspace.data(),
workspace_size.value,
out.data(),
input_tensor.data(),
target.data(),
weight.data() if has_weight else None,
pos_weight.data() if has_pos_weight else None,
None,
)
)
lib_op()
if sync is not None:
sync()
# 5. Validate the results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
if PROFILE:
profile_operation("PyTorch", lambda: torch.nn.functional.binary_cross_entropy_with_logits(
torch_input, torch_target, weight=torch_weight, pos_weight=torch_pos_weight, reduction=reduction
), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lib_op, device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyBCEWithLogitsDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mBCEWithLogits tests passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64, c_float, c_double
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
# ==============================================================================
# Configuration
# ==============================================================================
# Format: (M, N, D, x1_stride, x2_stride)
# x1: (M, D), x2: (N, D), out: (M, N)
_TEST_CASES_DATA = [
(5, 6, 3, None, None),
(32, 64, 128, None, None),
(32, 64, 128, (256, 1), (256, 1)), # strided-input case
(10, 7, 5, None, None),
]
_TENSOR_DTYPES = [InfiniDtype.F32] # cdist is precision-sensitive; start with F32
_TOLERANCE_MAP = {
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-4},
}
_P_VALUES = [1.0, 2.0, float("inf")] # exercise different p-norms
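# Worked example: for x1 = [0, 0] and x2 = [3, 4], p=1.0 gives |3| + |4| = 7,
# p=2.0 gives sqrt(3**2 + 4**2) = 5, and p=inf gives max(3, 4) = 4.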
_TEST_CASES = [
test_case + (p_val,)
for test_case in _TEST_CASES_DATA
for p_val in _P_VALUES
]
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 100
def test(
handle,
device,
M, N, D,
x1_stride=None,
x2_stride=None,
p=2.0,
dtype=InfiniDtype.F32,
sync=None,
):
print(
f"Testing Cdist on {InfiniDeviceNames[device]} with M:{M}, N:{N}, D:{D}, p:{p}, dtype:{InfiniDtypeNames[dtype]}"
)
# 1. Prepare the input/output shapes
x1_shape = (M, D)
x2_shape = (N, D)
out_shape = (M, N)
# 2. Prepare the input tensors
x1 = TestTensor(x1_shape, x1_stride, dtype, device)
x2 = TestTensor(x2_shape, x2_stride, dtype, device)
out = TestTensor(out_shape, None, dtype, device)
# 3. Compute the reference answer with PyTorch
# torch.cdist requires inputs to be at least 2-D
ans = torch.cdist(x1.torch_tensor(), x2.torch_tensor(), p=p)
if sync is not None:
sync()
# 4. Create the operator descriptor
descriptor = infiniopOperatorDescriptor_t()
# Note: this assumes the C interface is named infiniopCreateCdistDescriptor
check_error(
LIBINFINIOP.infiniopCreateCdistDescriptor(
handle,
ctypes.byref(descriptor),
out.descriptor,
x1.descriptor,
x2.descriptor,
c_double(p) # p is usually passed as a double or float
)
)
# Destroy the temporary descriptors to avoid stale references in the kernel (same style as addcmul)
for t in [x1, x2, out]:
t.destroy_desc()
# 5. Prepare the workspace
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetCdistWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, x1.device)
# 6. Define the execution function
def lib_cdist():
check_error(
LIBINFINIOP.infiniopCdist(
descriptor,
workspace.data(),
workspace_size.value,
out.data(),
x1.data(),
x2.data(),
None, # stream
)
)
# 7. Run
lib_cdist()
if sync is not None:
sync()
# 8. Validate the results
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
# 9. Profiling
if PROFILE:
profile_operation("PyTorch", lambda: torch.cdist(x1.torch_tensor(), x2.torch_tensor(), p=p), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_cdist(), device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyCdistDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mCdist tests passed!\033[0m")
@@ -4,7 +4,8 @@ from .structs import (
 infiniopOperatorDescriptor_t,
 )
-from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float, c_double, c_uint64
+from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float
class OpRegister:
registry = []
@@ -19,36 +20,6 @@ class OpRegister:
for op in cls.registry:
op(lib)
@OpRegister.operator
def atanh_(lib):
lib.infiniopCreateAtanhDescriptor.restype = c_int32
lib.infiniopCreateAtanhDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetAtanhWorkspaceSize.restype = c_int32
lib.infiniopGetAtanhWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopAtanh.restype = c_int32
lib.infiniopAtanh.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # y_data
c_void_p, # a_data
c_void_p, # stream
]
lib.infiniopDestroyAtanhDescriptor.restype = c_int32
lib.infiniopDestroyAtanhDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def add_(lib):
@@ -84,156 +55,6 @@ def add_(lib):
]
@OpRegister.operator
def addcmul_(lib):
lib.infiniopCreateAddcmulDescriptor.restype = c_int32
lib.infiniopCreateAddcmulDescriptor.argtypes = [
infiniopHandle_t, # handle
POINTER(infiniopOperatorDescriptor_t), # desc_ptr
infiniopTensorDescriptor_t, # out_desc
infiniopTensorDescriptor_t, # input_desc
infiniopTensorDescriptor_t, # t1_desc
infiniopTensorDescriptor_t, # t2_desc
c_float, # value (scalar coefficient)
]
lib.infiniopGetAddcmulWorkspaceSize.restype = c_int32
lib.infiniopGetAddcmulWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
POINTER(c_size_t), # size_ptr
]
lib.infiniopAddcmul.restype = c_int32
lib.infiniopAddcmul.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # out_ptr
c_void_p, # input_ptr
c_void_p, # t1_ptr
c_void_p, # t2_ptr
c_void_p, # stream
]
lib.infiniopDestroyAddcmulDescriptor.restype = c_int32
lib.infiniopDestroyAddcmulDescriptor.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
]
@OpRegister.operator
def cdist_(lib):
# 1. Descriptor-creation interface
# Typically takes the handle, the output desc, two input descs, and the norm order p
lib.infiniopCreateCdistDescriptor.restype = c_int32
lib.infiniopCreateCdistDescriptor.argtypes = [
infiniopHandle_t, # handle
POINTER(infiniopOperatorDescriptor_t), # desc_ptr
infiniopTensorDescriptor_t, # y_desc (output)
infiniopTensorDescriptor_t, # x1_desc
infiniopTensorDescriptor_t, # x2_desc
c_double, # p (norm order)
]
# 2. Workspace-size query interface
lib.infiniopGetCdistWorkspaceSize.restype = c_int32
lib.infiniopGetCdistWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
POINTER(c_size_t), # size_ptr
]
# 3. Operator-execution interface
lib.infiniopCdist.restype = c_int32
lib.infiniopCdist.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # y_ptr
c_void_p, # x1_ptr
c_void_p, # x2_ptr
c_void_p, # stream
]
# 4. Descriptor-destruction interface
lib.infiniopDestroyCdistDescriptor.restype = c_int32
lib.infiniopDestroyCdistDescriptor.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
]
@OpRegister.operator
def binary_cross_entropy_with_logits_(lib):
# 1. Descriptor creation
lib.infiniopCreateBCEWithLogitsDescriptor.restype = c_int32
lib.infiniopCreateBCEWithLogitsDescriptor.argtypes = [
infiniopHandle_t, # handle
POINTER(infiniopOperatorDescriptor_t), # desc_ptr
infiniopTensorDescriptor_t, # out_desc
infiniopTensorDescriptor_t, # input_desc (logits)
infiniopTensorDescriptor_t, # target_desc
infiniopTensorDescriptor_t, # weight_desc (optional; pass NULL when absent)
infiniopTensorDescriptor_t, # pos_weight_desc (optional; pass NULL when absent)
c_int32 # reduction (0:none, 1:mean, 2:sum)
]
# 2. Workspace size
lib.infiniopGetBCEWithLogitsWorkspaceSize.restype = c_int32
lib.infiniopGetBCEWithLogitsWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
POINTER(c_size_t), # size_ptr
]
# 3. Execution
lib.infiniopBCEWithLogits.restype = c_int32
lib.infiniopBCEWithLogits.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
c_void_p, # workspace
c_size_t, # workspace_size
c_void_p, # out_ptr
c_void_p, # input_ptr (logits)
c_void_p, # target_ptr
c_void_p, # weight_ptr (optional)
c_void_p, # pos_weight_ptr (optional)
c_void_p, # stream
]
# 4. Destruction
lib.infiniopDestroyBCEWithLogitsDescriptor.restype = c_int32
lib.infiniopDestroyBCEWithLogitsDescriptor.argtypes = [
infiniopOperatorDescriptor_t, # descriptor
]
@OpRegister.operator
def reciprocal_(lib):
lib.infiniopCreateReciprocalDescriptor.restype = c_int32
lib.infiniopCreateReciprocalDescriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t, # Output descriptor
infiniopTensorDescriptor_t, # Input descriptor
]
# Workspace-size query interface
lib.infiniopGetReciprocalWorkspaceSize.restype = c_int32
lib.infiniopGetReciprocalWorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
# The trailing c_void_p usually corresponds to a stream or other async handle; keep it consistent
lib.infiniopReciprocal.restype = c_int32
lib.infiniopReciprocal.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p, # Workspace pointer
c_size_t, # Workspace size
c_void_p, # Output data pointer
c_void_p, # Input data pointer
c_void_p, # Stream pointer (optional)
]
# Descriptor-destruction interface
lib.infiniopDestroyReciprocalDescriptor.restype = c_int32
lib.infiniopDestroyReciprocalDescriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
@OpRegister.operator
def equal_(lib):
# =========================================================
# 1. Register the Create function
......
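The registrations removed above all hang off the OpRegister decorator, whose body is truncated near the top of the diff. Below is a minimal sketch of that registry pattern, assuming only what the diff shows (the registry list, the operator decorator, and the op(lib) loop); the loader method's name here is hypothetical.

class OpRegister:
    registry = []

    @classmethod
    def operator(cls, fn):
        # Collect each binding function at import time; nothing touches the
        # shared library until it has actually been loaded.
        cls.registry.append(fn)
        return fn

    @classmethod
    def load_all(cls, lib):  # hypothetical name for the loop shown in the diff
        # Apply every registered binding (restype/argtypes setup) to lib.
        for op in cls.registry:
            op(lib)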
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration
# ==============================================================================
_TEST_CASES_ = [
# shape, input_stride, output_stride
((13, 4), None, None),
((13, 4), (10, 1), (10, 1)),
((16, 5632), None, None),
((16, 5632), (13312, 1), (13312, 1)),
((13, 16, 2), (128, 4, 1), (64, 4, 1)),
((4, 4, 5632), None, None),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE = auto()
_INPLACE = [
Inplace.OUT_OF_PLACE,
Inplace.INPLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Reciprocal usually outputs floats; Integer types are often not supported or special-cased
_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def reciprocal(y, x):
torch.reciprocal(x, out=y)
def test(
handle,
device,
shape,
in_stride=None,
out_stride=None,
inplace=Inplace.OUT_OF_PLACE,
dtype=InfiniDtype.F16,
sync=None,
):
# Initialize input 'x'
# Use 'random' mode but ensure values are not near zero to avoid infinity
x = TestTensor(shape, in_stride, dtype, device)
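# Keep inputs away from zero so 1/x stays finite (mirrors the clamp step in
# the atanh test above; assumes the same TestTensor torch_tensor()/actual_tensor() API).
with torch.no_grad():
    t = x.torch_tensor()
    t[t.abs() < 0.1] = 0.5
    x.actual_tensor().copy_(t)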
if inplace == Inplace.INPLACE:
if in_stride != out_stride:
return
y = x
else:
y = TestTensor(shape, out_stride, dtype, device)
if y.is_broadcast():
return
print(
f"Testing Reciprocal on {InfiniDeviceNames[device]} with shape:{shape} "
f"in_stride:{in_stride} out_stride:{out_stride} "
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
)
# Calculate ground truth using PyTorch
reciprocal(y.torch_tensor(), x.torch_tensor())
if sync is not None:
sync()
# Create Descriptor
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateReciprocalDescriptor(
handle,
ctypes.byref(descriptor),
y.descriptor,
x.descriptor,
)
)
# Invalidate descriptors as per framework requirement
for tensor in [x, y]:
tensor.destroy_desc()
# Workspace allocation
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetReciprocalWorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, y.device)
def lib_reciprocal():
check_error(
LIBINFINIOP.infiniopReciprocal(
descriptor,
workspace.data(),
workspace.size(),
y.data(),
x.data(),
None,
)
)
lib_reciprocal()
# Verification
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
# Profiling
if PROFILE:
profile_operation("PyTorch", lambda: reciprocal(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS)
check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor))
if __name__ == "__main__":
args = get_args()
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")