Commit 18773b69 authored by wooway777

Revert "Merge pull request #1069 from InfiniTensor/issue/1031_T1_1_15"

This reverts commit 21c6af2d, reversing
changes made to 99a802dd.
parent bfead271
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor
def reciprocal(input, *, out=None):
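    """Element-wise reciprocal of `input`; writes into `out` when provided."""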
if out is None:
return Tensor(_infinicore.reciprocal(input._underlying))
_infinicore.reciprocal_(out._underlying, input._underlying)
return out
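The same thin-wrapper convention would extend to the other ops in this commit; a sketch for atanh, assuming `_infinicore` exposes `atanh`/`atanh_` as bound further below (illustrative, not part of the original file):

def atanh(input, *, out=None):
    # Sketch only: the reciprocal wrapper pattern applied to atanh.
    if out is None:
        return Tensor(_infinicore.atanh(input._underlying))
    _infinicore.atanh_(out._underlying, input._underlying)
    return out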
#include "infinicore/ops/addcmul.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<Addcmul::schema> &Addcmul::dispatcher() {
static common::OpDispatcher<Addcmul::schema> dispatcher_;
return dispatcher_;
}
// Core execution logic: device validation and backend dispatch
void Addcmul::execute(Tensor out, Tensor input, Tensor t1, Tensor t2, float value) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, t1, t2);
infinicore::context::setDevice(out->device());
dispatcher().lookup(out->device().getType())(out, input, t1, t2, value);
}
// Out-of-place interface: allocates the output tensor automatically
Tensor addcmul(Tensor input, Tensor t1, Tensor t2, float value) {
auto out = Tensor::empty(input->shape(), input->dtype(), input->device());
addcmul_(out, input, t1, t2, value);
return out;
}
void addcmul_(Tensor out, Tensor input, Tensor t1, Tensor t2, float value) {
Addcmul::execute(out, input, t1, t2, value);
}
} // namespace infinicore::op
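For reference, addcmul's element-wise semantics are out = input + value * t1 * t2; a NumPy sketch one could check a backend against (names here are illustrative):

import numpy as np

def addcmul_ref(input, t1, t2, value=1.0):
    # Element-wise: input + value * (t1 * t2)
    return input + value * (t1 * t2)

x = np.array([1.0, 2.0])
assert np.allclose(addcmul_ref(x, np.array([3.0, 4.0]), np.array([5.0, 6.0]), 0.5),
                   [8.5, 14.0])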
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/addcmul.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::addcmul_impl::infiniop {
// Thread-local cache of operator descriptors
thread_local common::OpCache<size_t, infiniopAddcmulDescriptor_t> caches(
100,
[](infiniopAddcmulDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyAddcmulDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor out, Tensor input, Tensor t1, Tensor t2, float value) {
size_t seed = hash_combine(out, input, t1, t2, value);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopAddcmulDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateAddcmulDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(), input->desc(), t1->desc(), t2->desc(), value));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetAddcmulWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopAddcmul(
desc, workspace->data(), workspace_size,
out->data(), input->data(), t1->data(), t2->data(), context::getStream()));
}
static bool registered = []() {
Addcmul::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::addcmul_impl::infiniop
#include "infinicore/ops/atanh.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
// Returns the singleton dispatcher
common::OpDispatcher<Atanh::schema> &Atanh::dispatcher() {
static common::OpDispatcher<Atanh::schema> dispatcher_;
return dispatcher_;
}
// Execution entry point: handles device switching and backend lookup
void Atanh::execute(Tensor y, Tensor a) {
// Ensure input and output are on the same device
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, a);
// Switch the current context to the target device
infinicore::context::setDevice(y->device());
// Look up and run the implementation registered for the device type (CPU/CUDA/etc.)
dispatcher().lookup(y->device().getType())(y, a);
}
// Out-of-place interface: allocates the result tensor automatically
Tensor atanh(Tensor a) {
// Allocate an empty tensor matching the input's shape, dtype, and device
auto y = Tensor::empty(a->shape(), a->dtype(), a->device());
atanh_(y, a);
return y;
}
// In-place / explicit-output interface
void atanh_(Tensor y, Tensor a) {
Atanh::execute(y, a);
}
} // namespace infinicore::op
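For reference, atanh is the inverse of tanh on (-1, 1); a NumPy sanity check of the identity this op implements:

import numpy as np

# atanh(x) = 0.5 * ln((1 + x) / (1 - x)) for x in (-1, 1)
x = np.array([-0.5, 0.0, 0.5])
assert np.allclose(np.arctanh(x), 0.5 * np.log((1 + x) / (1 - x)))
assert np.allclose(np.tanh(np.arctanh(x)), x)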
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/atanh.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::atanh_impl::infiniop {
// Thread-local descriptor cache; avoids the overhead of recreating descriptors
thread_local common::OpCache<size_t, infiniopAtanhDescriptor_t> caches(
100, // cache capacity
[](infiniopAtanhDescriptor_t &desc) {
if (desc != nullptr) {
// Eviction callback: destroy the infiniop operator descriptor
INFINICORE_CHECK_ERROR(infiniopDestroyAtanhDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor y, Tensor a) {
// 1. Hash the tensors' shapes, strides, dtypes, etc. into a unique key
size_t seed = hash_combine(y, a);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
// 2. Try to fetch an existing descriptor from the cache
auto desc_opt = cache.get(seed);
infiniopAtanhDescriptor_t desc = nullptr;
if (!desc_opt) {
// Cache miss: create a new descriptor
INFINICORE_CHECK_ERROR(infiniopCreateAtanhDescriptor(
context::getInfiniopHandle(device), &desc,
y->desc(), a->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
// 3. Query and allocate the required workspace (if any)
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetAtanhWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
// 4. Run the underlying computation
INFINICORE_CHECK_ERROR(infiniopAtanh(
desc, workspace->data(), workspace_size,
y->data(), a->data(), context::getStream()));
}
// 5. Self-registration: adds this implementation to the dispatcher at startup
static bool registered = []() {
Atanh::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::atanh_impl::infiniop
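The thread-local descriptor cache used by each implementation is, in effect, a capacity-bounded LRU map with an eviction callback; a minimal Python sketch of that shape (illustrative, not the actual common::OpCache API):

from collections import OrderedDict

class LruCache:
    def __init__(self, capacity, on_evict):
        self.capacity, self.on_evict = capacity, on_evict
        self.entries = OrderedDict()

    def get(self, key):
        if key not in self.entries:
            return None
        self.entries.move_to_end(key)  # mark as most recently used
        return self.entries[key]

    def put(self, key, desc):
        self.entries[key] = desc
        self.entries.move_to_end(key)
        if len(self.entries) > self.capacity:
            _, evicted = self.entries.popitem(last=False)  # drop oldest entry
            self.on_evict(evicted)  # e.g. destroy the descriptor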
#include "infinicore/ops/binary_cross_entropy_with_logits.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
// Static dispatcher instantiation
common::OpDispatcher<BinaryCrossEntropyWithLogits::schema> &BinaryCrossEntropyWithLogits::dispatcher() {
static common::OpDispatcher<BinaryCrossEntropyWithLogits::schema> dispatcher_;
return dispatcher_;
}
/**
 * Core execution logic: device validation, context setup, and backend dispatch.
*/
void BinaryCrossEntropyWithLogits::execute(Tensor out, Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction) {
// 1. Verify that all defined tensors live on the same device:
// logits, target, out, plus the optional weight/pos_weight
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, logits, target);
if (weight) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, weight);
}
if (pos_weight) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, pos_weight);
}
// 2. Set the current device context
infinicore::context::setDevice(out->device());
// 3. Look up and run the backend implementation for the device type (e.g. CUDA or CPU)
dispatcher().lookup(out->device().getType())(out, logits, target, weight, pos_weight, reduction);
}
/**
 * Out-of-place interface: the output tensor is created according to `reduction`.
*/
Tensor binary_cross_entropy_with_logits(Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction) {
std::vector<uint64_t> out_shape;
// 1. Determine the output shape from the reduction mode
if (reduction == "none") {
// No reduction: output shape matches the input logits
auto in_shape = logits->shape();
for (auto dim : in_shape) {
out_shape.push_back(static_cast<uint64_t>(dim));
}
} else {
// mean or sum reduction: scalar output (an empty shape vector denotes a 0-dim tensor)
out_shape = {};
}
// 2. Allocate the output tensor
auto out = Tensor::empty(out_shape, logits->dtype(), logits->device());
// 3. Delegate to the explicit-output interface
binary_cross_entropy_with_logits_(out, logits, target, weight, pos_weight, reduction);
return out;
}
/**
 * Explicit-output interface.
*/
void binary_cross_entropy_with_logits_(Tensor out, Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction) {
BinaryCrossEntropyWithLogits::execute(out, logits, target, weight, pos_weight, reduction);
}
} // namespace infinicore::op
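The output-shape rule above is small enough to state as a check: 'none' preserves the logits shape, while 'mean'/'sum' produce a 0-dim scalar (a Python sketch):

def bce_out_shape(logits_shape, reduction="mean"):
    return list(logits_shape) if reduction == "none" else []

assert bce_out_shape((4, 3), "none") == [4, 3]
assert bce_out_shape((4, 3), "mean") == []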
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/binary_cross_entropy_with_logits.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::bce_logits_impl::infiniop {
// Thread-local cache of BCEWithLogits operator descriptors
thread_local common::OpCache<size_t, infiniopBCEWithLogitsDescriptor_t> caches(
100,
[](infiniopBCEWithLogitsDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyBCEWithLogitsDescriptor(desc));
desc = nullptr;
}
});
/**
 * @brief Runs the BCEWithLogits computation.
 * @param out Output tensor (scalar or same shape as logits, depending on reduction)
 * @param logits Prediction (logit) tensor
 * @param target Label tensor
 * @param weight Optional per-sample weight tensor
 * @param pos_weight Optional positive-class weight tensor
 * @param reduction_str Reduction mode ("none", "mean", "sum")
*/
void calculate(Tensor out, Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction_str) {
// 1. Map the string reduction argument onto the enum used by the underlying API
infiniopReduction_t reduction;
if (reduction_str == "none") {
reduction = INFINIOP_REDUCTION_NONE;
} else if (reduction_str == "mean") {
reduction = INFINIOP_REDUCTION_MEAN;
} else if (reduction_str == "sum") {
reduction = INFINIOP_REDUCTION_SUM;
} else {
throw std::runtime_error("Unknown reduction mode: " + reduction_str);
}
// 2. Build a unique hash seed for the cache lookup; it covers every
// input tensor's state plus the reduction mode, keeping cache keys unique
size_t seed = hash_combine(out, logits, target, weight, pos_weight, static_cast<int>(reduction));
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopBCEWithLogitsDescriptor_t desc = nullptr;
// 3. On a cache miss, create a new descriptor and store it in the cache
if (!desc_opt) {
// Descriptors of the optional tensors; pass nullptr when undefined
auto weight_desc = weight ? weight->desc() : nullptr;
auto pos_weight_desc = pos_weight ? pos_weight->desc() : nullptr;
INFINICORE_CHECK_ERROR(infiniopCreateBCEWithLogitsDescriptor(
context::getInfiniopHandle(device),
&desc,
out->desc(),
logits->desc(),
target->desc(),
weight_desc,
pos_weight_desc,
reduction));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
// 4. Query and allocate the temporary workspace
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetBCEWithLogitsWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
// 5. Resolve data pointers; optional tensors become nullptr
const void *weight_ptr = weight ? weight->data() : nullptr;
const void *pos_weight_ptr = pos_weight ? pos_weight->data() : nullptr;
// 6. Run the underlying operator
INFINICORE_CHECK_ERROR(infiniopBCEWithLogits(
desc,
workspace->data(),
workspace_size,
out->data(),
logits->data(),
target->data(),
weight_ptr,
pos_weight_ptr,
context::getStream()));
}
// 7. Self-register with the dispatcher
static bool registered = []() {
BinaryCrossEntropyWithLogits::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::bce_logits_impl::infiniop
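Under the usual PyTorch-style definition, the loss this backend computes can be cross-checked with a numerically stable NumPy reference (a sketch; it uses log σ(x) = -softplus(-x) and log(1-σ(x)) = -softplus(x)):

import numpy as np

def bce_with_logits_ref(logits, target, weight=None, pos_weight=None, reduction="mean"):
    log_sig = -np.logaddexp(0.0, -logits)       # log(sigmoid(x)), stable
    log_one_minus = -np.logaddexp(0.0, logits)  # log(1 - sigmoid(x)), stable
    pw = pos_weight if pos_weight is not None else 1.0
    loss = -(pw * target * log_sig + (1.0 - target) * log_one_minus)
    if weight is not None:
        loss = weight * loss
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss  # reduction == "none"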
#include "infinicore/ops/cdist.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
// Static dispatcher instantiation
common::OpDispatcher<Cdist::schema> &Cdist::dispatcher() {
static common::OpDispatcher<Cdist::schema> dispatcher_;
return dispatcher_;
}
/**
 * Core execution logic: device validation and backend dispatch.
*/
void Cdist::execute(Tensor out, Tensor x1, Tensor x2, double p) {
// Verify that all three tensors are on the same device
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, x1, x2);
// Set the current device context
infinicore::context::setDevice(out->device());
// Look up and run the registered implementation for the device type (CUDA/CPU/etc.)
dispatcher().lookup(out->device().getType())(out, x1, x2, p);
}
/**
 * Out-of-place interface: allocates the output tensor automatically.
* x1: (M, D), x2: (N, D) -> out: (M, N)
*/
Tensor cdist(Tensor x1, Tensor x2, double p) {
    // Derive the output shape {M, N} from x1 {M, D} and x2 {N, D}
    auto shape1 = x1->shape();
    auto shape2 = x2->shape();
    auto out = Tensor::empty({static_cast<uint64_t>(shape1[0]), static_cast<uint64_t>(shape2[0])},
                             x1->dtype(), x1->device());
    // Run through the explicit-output interface
cdist_(out, x1, x2, p);
return out;
}
/**
 * Explicit-output interface.
*/
void cdist_(Tensor out, Tensor x1, Tensor x2, double p) {
Cdist::execute(out, x1, x2, p);
}
} // namespace infinicore::op
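For p > 0 the distances computed here are out[i, j] = ||x1[i] - x2[j]||_p; a NumPy reference sketch for verification:

import numpy as np

def cdist_ref(x1, x2, p=2.0):
    diff = np.abs(x1[:, None, :] - x2[None, :, :])  # (M, N, D) differences
    return (diff ** p).sum(axis=-1) ** (1.0 / p)

x1 = np.array([[0.0, 0.0], [1.0, 1.0]])
x2 = np.array([[3.0, 4.0]])
assert np.allclose(cdist_ref(x1, x2), [[5.0], [np.sqrt(13.0)]])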
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/cdist.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::cdist_impl::infiniop {
// Thread-local cache of cdist operator descriptors
// The cache key hashes the input tensor descriptors and the parameter p
thread_local common::OpCache<size_t, infiniopCdistDescriptor_t> caches(
100,
[](infiniopCdistDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyCdistDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor out, Tensor x1, Tensor x2, double p) {
// 1. Build a unique hash seed for the cache lookup
size_t seed = hash_combine(out, x1, x2, p);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopCdistDescriptor_t desc = nullptr;
// 2. On a cache miss, create a new descriptor and store it in the cache
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateCdistDescriptor(
context::getInfiniopHandle(device),
&desc,
out->desc(),
x1->desc(),
x2->desc(),
p));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
// 3. Query and allocate the temporary workspace
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetCdistWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
// 4. Run the underlying operator
INFINICORE_CHECK_ERROR(infiniopCdist(
desc,
workspace->data(),
workspace_size,
out->data(),
x1->data(),
x2->data(),
context::getStream()));
}
// 5. Self-register with the dispatcher
static bool registered = []() {
Cdist::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::cdist_impl::infiniop
#include "infinicore/ops/reciprocal.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<Reciprocal::schema> &Reciprocal::dispatcher() {
static common::OpDispatcher<Reciprocal::schema> dispatcher_;
return dispatcher_;
}
void Reciprocal::execute(Tensor y, Tensor x) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
infinicore::context::setDevice(y->device());
dispatcher().lookup(y->device().getType())(y, x);
}
Tensor reciprocal(Tensor x) {
auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
reciprocal_(y, x);
return y;
}
void reciprocal_(Tensor y, Tensor x) {
Reciprocal::execute(y, x);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/reciprocal.hpp"
#include <infiniop.h>
namespace infinicore::op::reciprocal_impl::infiniop {
thread_local common::OpCache<size_t, infiniopReciprocalDescriptor_t> caches(
100, // capacity
[](infiniopReciprocalDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyReciprocalDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor y, Tensor x) {
size_t seed = hash_combine(y, x);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopReciprocalDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateReciprocalDescriptor(
context::getInfiniopHandle(device), &desc,
y->desc(), x->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetReciprocalWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopReciprocal(
desc, workspace->data(), workspace_size,
y->data(), x->data(), context::getStream()));
}
static bool registered = []() {
Reciprocal::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::reciprocal_impl::infiniop
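Every backend file above ends with the same self-registration idiom: a static initializer registers `calculate` with the op's dispatcher at load time, and `execute` later looks the kernel up by device type. A Python sketch of the presumed semantics, taking `registerAll(fn, false)` to mean "register for all device types without overriding" (device names illustrative):

dispatch_table = {}

def register_all(fn, override=False):
    # Register fn for every known device type, unless one is already present.
    for device_type in ("cpu", "cuda"):
        if override or device_type not in dispatch_table:
            dispatch_table[device_type] = fn

def execute(device_type, *args):
    return dispatch_table[device_type](*args)

register_all(lambda y, x: f"reciprocal({x}) -> {y}")  # runs at import time
assert execute("cpu", "y", "x") == "reciprocal(x) -> y"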
@@ -4,14 +4,10 @@
#include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/addcmul.hpp"
#include "ops/all.hpp"
#include "ops/atanh.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/binary_cross_entropy_with_logits.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cdist.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/embedding.hpp"
#include "ops/equal.hpp"
@@ -30,7 +26,6 @@
#include "ops/paged_caching.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp"
#include "ops/reciprocal.hpp"
#include "ops/rms_norm.hpp"
#include "ops/rope.hpp"
#include "ops/silu.hpp"
@@ -79,11 +74,6 @@ inline void bind(py::module &m) {
bind_topk(m);
bind_all(m);
bind_equal(m);
bind_atanh(m);
bind_addcmul(m);
bind_cdist(m);
bind_binary_cross_entropy_with_logits(m);
bind_reciprocal(m);
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/addcmul.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_addcmul(py::module &m) {
// Bind the out-of-place interface: out = addcmul(input, t1, t2, value)
m.def("addcmul",
&op::addcmul,
py::arg("input"),
py::arg("tensor1"),
py::arg("tensor2"),
py::arg("value") = 1.0f,
R"doc(Performs the element-wise multiplication of tensor1 by tensor2,
multiplies the result by value and adds it to input.
Args:
input: Tensor to be added
tensor1: First tensor for multiplication
tensor2: Second tensor for multiplication
value: Scalar multiplier for tensor1 * tensor2 (default: 1.0)
Returns:
The output tensor
)doc");
// Bind the explicit-output interface: addcmul_(out, input, t1, t2, value)
m.def("addcmul_",
&op::addcmul_,
py::arg("out"),
py::arg("input"),
py::arg("tensor1"),
py::arg("tensor2"),
py::arg("value") = 1.0f,
R"doc(In-place version of addcmul.
Args:
out: The destination tensor to store the result
input: Tensor to be added
tensor1: First tensor for multiplication
tensor2: Second tensor for multiplication
value: Scalar multiplier for tensor1 * tensor2 (default: 1.0)
)doc");
}
} // namespace infinicore::ops
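Because `value` is bound with `py::arg("value") = 1.0f`, Python callers can omit it or pass it by keyword. A sketch of an infinicore-style wrapper over these bindings, following the reciprocal wrapper convention at the top of this commit (illustrative):

from infinicore.lib import _infinicore
from infinicore.tensor import Tensor

def addcmul(input, tensor1, tensor2, *, value=1.0, out=None):
    if out is None:
        return Tensor(_infinicore.addcmul(
            input._underlying, tensor1._underlying, tensor2._underlying, value))
    _infinicore.addcmul_(out._underlying, input._underlying,
                         tensor1._underlying, tensor2._underlying, value)
    return out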
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/atanh.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_atanh(py::module &m) {
m.def("atanh",
&op::atanh,
py::arg("a"),
R"doc(Inverse hyperbolic tangent of a tensor.)doc");
m.def("atanh_",
&op::atanh_,
py::arg("y"),
py::arg("a"),
R"doc(Compute inverse hyperbolic tangent and store in the provided output tensor.)doc");
}
} // namespace infinicore::ops
#pragma once
#include "infinicore/ops/binary_cross_entropy_with_logits.hpp"
#include <pybind11/pybind11.h>
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_binary_cross_entropy_with_logits(py::module &m) {
// 1. Bind the out-of-place interface: out = binary_cross_entropy_with_logits(...)
m.def(
"binary_cross_entropy_with_logits",
[](Tensor logits,
Tensor target,
py::object weight,
py::object pos_weight,
std::string reduction) {
Tensor w = weight.is_none() ? Tensor() : weight.cast<Tensor>();
Tensor pw = pos_weight.is_none() ? Tensor() : pos_weight.cast<Tensor>();
return op::binary_cross_entropy_with_logits(
logits, target, w, pw, reduction);
},
py::arg("input"),
py::arg("target"),
py::arg("weight") = py::none(),
py::arg("pos_weight") = py::none(),
py::arg("reduction") = "mean",
R"doc(Measures Binary Cross Entropy between target and output logits.
Args:
input: Tensor of arbitrary shape as unnormalized scores (logits).
target: Tensor of the same shape as input with values between 0 and 1.
weight: Optional rescaling weight for each loss component.
pos_weight: Optional weight for positive examples (must be broadcastable).
reduction: Specifies the reduction to apply: 'none' | 'mean' | 'sum'.
Returns:
A tensor representing the loss.
)doc");
// 2. Bind the explicit-output interface: binary_cross_entropy_with_logits_(out, ...)
m.def(
"binary_cross_entropy_with_logits_",
[](Tensor output,
Tensor logits,
Tensor target,
py::object weight,
py::object pos_weight,
std::string reduction) {
Tensor w = weight.is_none() ? Tensor() : weight.cast<Tensor>();
Tensor pw = pos_weight.is_none() ? Tensor() : pos_weight.cast<Tensor>();
return op::binary_cross_entropy_with_logits_(
output, logits, target, w, pw, reduction);
},
py::arg("out"),
py::arg("input"),
py::arg("target"),
py::arg("weight") = py::none(),
py::arg("pos_weight") = py::none(),
py::arg("reduction") = "mean",
R"doc(Specified output version of binary_cross_entropy_with_logits.
Args:
out: The destination tensor to store the loss.
input: Logits tensor.
target: Target tensor.
weight: Optional sample weight.
pos_weight: Optional positive class weight.
reduction: Specifies the reduction to apply.
)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/cdist.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_cdist(py::module &m) {
// 1. Bind the out-of-place interface: out = cdist(x1, x2, p)
m.def("cdist",
&op::cdist,
py::arg("x1"),
py::arg("x2"),
py::arg("p") = 2.0,
R"doc(Computes batched pairwise distance between vectors in x1 and x2 using p-norm.
Args:
x1: First set of vectors, shape (M, D)
x2: Second set of vectors, shape (N, D)
p: The p-norm to apply (default: 2.0)
Returns:
A matrix containing pairwise distances, shape (M, N)
)doc");
// 2. Bind the explicit-output interface: cdist_(out, x1, x2, p)
m.def("cdist_",
&op::cdist_,
py::arg("out"),
py::arg("x1"),
py::arg("x2"),
py::arg("p") = 2.0,
R"doc(In-place version of cdist. Stores the results in the 'out' tensor.
Args:
out: The destination tensor, shape (M, N)
x1: First set of vectors, shape (M, D)
x2: Second set of vectors, shape (N, D)
p: The p-norm to apply (default: 2.0)
)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/reciprocal.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_reciprocal(py::module &m) {
m.def("reciprocal",
&op::reciprocal,
py::arg("x"),
R"doc(Computes the reciprocal of the input tensor.)doc");
m.def("reciprocal_",
&op::reciprocal_,
py::arg("y"),
py::arg("x"),
R"doc(Computes the reciprocal of the input tensor and stores in the output tensor.)doc");
}
} // namespace infinicore::ops
@@ -13,11 +13,6 @@ DECLARE_INFINIOP_TEST(rope)
DECLARE_INFINIOP_TEST(clip)
DECLARE_INFINIOP_TEST(swiglu)
DECLARE_INFINIOP_TEST(add)
DECLARE_INFINIOP_TEST(atanh)
DECLARE_INFINIOP_TEST(addcmul)
DECLARE_INFINIOP_TEST(cdist)
DECLARE_INFINIOP_TEST(binary_cross_entropy_with_logits)
DECLARE_INFINIOP_TEST(reciprocal)
DECLARE_INFINIOP_TEST(causal_softmax)
DECLARE_INFINIOP_TEST(rearrange)
DECLARE_INFINIOP_TEST(silu)
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::addcmul {
struct Test::Attributes {
std::shared_ptr<Tensor> input;
std::shared_ptr<Tensor> t1;
std::shared_ptr<Tensor> t2;
std::shared_ptr<Tensor> out;
std::shared_ptr<Tensor> ans;
float value;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
// Verify that all required tensors are present
if (tensors.find("input") == tensors.end() || tensors.find("t1") == tensors.end() || tensors.find("t2") == tensors.end() || tensors.find("out") == tensors.end() || tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Addcmul Test: Missing tensors");
}
    // Read the scalar attribute `value` (defaults to 1.0f)
    test->_attributes->value = 1.0f;
if (attributes.find("value") != attributes.end()) {
test->_attributes->value = *reinterpret_cast<float *>(attributes["value"].data());
}
test->_attributes->input = tensors["input"];
test->_attributes->t1 = tensors["t1"];
test->_attributes->t2 = tensors["t2"];
test->_attributes->out = tensors["out"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopAddcmulDescriptor_t op_desc;
// Move the data to the target device
auto input = _attributes->input->to(device, device_id);
auto t1 = _attributes->t1->to(device, device_id);
auto t2 = _attributes->t2->to(device, device_id);
auto out = _attributes->out->to(device, device_id);
// Create the operator descriptor
CHECK_OR(infiniopCreateAddcmulDescriptor(handle, &op_desc,
out->desc(),
input->desc(),
t1->desc(),
t2->desc(),
_attributes->value),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create addcmul descriptor."));
// Workspace query and allocation
size_t workspace_size;
CHECK_OR(infiniopGetAddcmulWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
// Run the computation
CHECK_OR(infiniopAddcmul(op_desc, workspace, workspace_size,
out->data(),
input->data(),
t1->data(),
t2->data(),
nullptr), // stream
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
// Verify the result
try {
allClose(out, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
// Benchmark
double elapsed_time = benchmark(
[=]() {
infiniopAddcmul(op_desc, workspace, workspace_size,
out->data(),
input->data(),
t1->data(),
t2->data(),
nullptr);
},
warm_ups, iterations);
// Release resources
infinirtFree(workspace);
infiniopDestroyAddcmulDescriptor(op_desc);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {"value"};
}
std::vector<std::string> Test::tensor_names() {
return {"input", "t1", "t2", "out", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"out"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- value: " << _attributes->value << std::endl;
oss << "- input: " << _attributes->input->info() << std::endl;
oss << "- t1: " << _attributes->t1->info() << std::endl;
oss << "- t2: " << _attributes->t2->info() << std::endl;
oss << "- out: " << _attributes->out->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::addcmul
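In the test above, the `value` attribute arrives as a raw byte vector and is recovered with a reinterpret_cast to float; a test-case generator would pack it the same way (a Python sketch, assuming a little-endian float32 layout):

import struct

value_bytes = struct.pack("<f", 0.5)  # 4-byte float32, little-endian
assert struct.unpack("<f", value_bytes)[0] == 0.5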
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::atanh {
struct Test::Attributes {
    std::shared_ptr<Tensor> a;   // input
    std::shared_ptr<Tensor> y;   // output
    std::shared_ptr<Tensor> ans; // reference result
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
// atanh only needs a (input), y (output), and ans (reference)
if (tensors.find("a") == tensors.end()
|| tensors.find("y") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Atanh Test: Missing tensors.");
}
test->_attributes->a = tensors["a"];
test->_attributes->y = tensors["y"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopAtanhDescriptor_t op_desc;
auto a = _attributes->a->to(device, device_id);
auto y = _attributes->y->to(device, device_id);
// Use the 4-parameter creation interface (handle, desc, y, a)
CHECK_OR(infiniopCreateAtanhDescriptor(handle, &op_desc,
y->desc(),
a->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create atanh descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetAtanhWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
// Run the computation (unary op: only y and a, no second operand)
CHECK_OR(infiniopAtanh(op_desc, workspace, workspace_size,
y->data(),
a->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during atanh execution."));
// Verify the result
try {
allClose(y, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
    // Benchmark
    double elapsed_time = benchmark(
[=]() {
infiniopAtanh(
op_desc, workspace, workspace_size,
y->data(),
a->data(),
nullptr);
},
warm_ups, iterations);
    // Release resources, as in the addcmul test
    infinirtFree(workspace);
    infiniopDestroyAtanhDescriptor(op_desc);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"a", "y", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"y"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- a: " << _attributes->a->info() << std::endl;
oss << "- y: " << _attributes->y->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::atanh
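The `benchmark(fn, warm_ups, iterations)` helper used by both tests presumably runs unmeasured warm-up iterations and then averages wall time per call; a Python sketch of that shape (illustrative, not the actual harness, and the time unit is an assumption):

import time

def benchmark(fn, warm_ups, iterations):
    for _ in range(warm_ups):
        fn()  # warm-up runs are not timed
    start = time.perf_counter()
    for _ in range(iterations):
        fn()
    return (time.perf_counter() - start) / iterations  # average seconds per call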