Commit cb7f0b7d authored by wooway777

Revert "Merge pull request #1056 from InfiniTensor/issue/1031"

This reverts commit 7f295448, reversing
changes made to e60985dc.
parent 037140c0
#include "infinicore/ops/baddbmm.hpp"
#include "infinicore/ops/gemm.hpp"
#include "infinicore/ops/rearrange.hpp"
namespace infinicore::op {
// 内联的 BLAS 兼容性检查,减少函数调用开销
inline bool is_blas_compatible(const Tensor &t) {
const auto ndim = t->ndim();
if (ndim == 2) {
const auto rs = t->stride(0);
const auto cs = t->stride(1);
if (rs != 1 && cs != 1) {
return false;
}
if (rs == 1 && cs == 1) {
return t->shape()[0] == 1 || t->shape()[1] == 1;
}
return true;
} else if (ndim == 3) {
const auto rs = t->stride(1);
const auto cs = t->stride(2);
if (t->shape()[0] > 1 && t->stride(0) == 0) {
return false;
}
if (rs != 1 && cs != 1) {
return false;
}
if (rs == 1 && cs == 1) {
return t->shape()[1] == 1 || t->shape()[2] == 1;
}
return true;
}
return false;
}
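
// Illustrative sketch (editor's addition, not part of this commit): how the
// check above classifies a few layouts, using only Tensor APIs that appear
// elsewhere in this commit (`permute`, `as_strided`).
inline void blas_compat_examples(const Tensor &base /* contiguous {4, 8} */) {
    bool row_major = is_blas_compatible(base);                          // true: strides {8, 1}
    bool col_major = is_blas_compatible(base->permute({1, 0}));         // true: strides {1, 8}
    bool gapped = is_blas_compatible(base->as_strided({2, 4}, {2, 3})); // false: neither stride is 1
    (void)row_major;
    (void)col_major;
    (void)gapped;
}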

inline void prepare_gemm_input(Tensor &output, Tensor &input, const size_t batch_size, const size_t m, const size_t n) {
    const auto input_ndim = input->ndim();
    if (input_ndim == 2) {
        // Broadcast a 2-D input across the batch dimension (batch stride 0).
        rearrange_(output, input->as_strided(
                               {batch_size, m, n},
                               {0, input->stride(0), input->stride(1)}));
    } else if (input_ndim == 3 && input->shape()[0] == 1 && batch_size > 1) {
        // Broadcast a single-batch 3-D input across all batches.
        rearrange_(output, input->as_strided(
                               {batch_size, m, n},
                               {0, input->stride(1), input->stride(2)}));
    } else {
        rearrange_(output, input);
    }
}

Tensor baddbmm(Tensor input, Tensor batch1, Tensor batch2,
               float beta,
               float alpha) {
    const size_t batch_size = batch1->shape()[0];
    const size_t m = batch1->shape()[1];
    const size_t n = batch2->shape()[2];
    const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
    const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
    if (beta == 0.0f) {
        // The input term vanishes, so this reduces to a plain batched GEMM.
        return gemm(a, b, alpha, 0.0f);
    }
    Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
    prepare_gemm_input(result, input, batch_size, m, n);
    gemm_(result, a, b, alpha, beta);
    return result;
}

void baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2,
              float beta,
              float alpha) {
    const size_t batch_size = batch1->shape()[0];
    const size_t m = batch1->shape()[1];
    const size_t n = batch2->shape()[2];
    const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
    const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
    const bool out_is_usable = out->is_contiguous() && out->ndim() == 3
                            && out->shape()[0] == batch_size
                            && out->shape()[1] == m
                            && out->shape()[2] == n;
    if (out_is_usable) {
        if (beta != 0.0f && input->data() != out->data()) {
            prepare_gemm_input(out, input, batch_size, m, n);
        }
        gemm_(out, a, b, alpha, beta);
    } else {
        Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
        if (beta != 0.0f) {
            prepare_gemm_input(result, input, batch_size, m, n);
        }
        gemm_(result, a, b, alpha, beta);
        rearrange_(out, result);
    }
}

} // namespace infinicore::op
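
// Usage sketch (editor's addition, not part of this commit): baddbmm follows
// the torch.baddbmm convention, out[i] = beta * input[i] + alpha * batch1[i] @ batch2[i];
// the shape comments are illustrative.
namespace infinicore::op {
inline Tensor baddbmm_example(Tensor input, Tensor batch1, Tensor batch2) {
    // input: {B, M, N}, batch1: {B, M, K}, batch2: {B, K, N} -> out: {B, M, N}
    return baddbmm(input, batch1, batch2, /*beta=*/1.0f, /*alpha=*/1.0f);
}
} // namespace infinicore::op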
#include "infinicore/ops/bilinear.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/matmul.hpp"
#include "infinicore/ops/rearrange.hpp"
#ifdef ENABLE_NVIDIA_API
namespace op::gemm::nvidia {
void set_tf32_enabled(bool);
}
#endif
namespace infinicore::op {
namespace {
// RAII 守卫:作用域内禁用 TF32
struct ScopedTF32Disable {
ScopedTF32Disable() {
#ifdef ENABLE_NVIDIA_API
// 实际项目中建议添加检查,仅在 NVIDIA 设备上调用
// 使用 ::op 强制从全局命名空间查找,避免被当前的 infinicore::op 遮蔽
::op::gemm::nvidia::set_tf32_enabled(false);
#endif
}
~ScopedTF32Disable() {
#ifdef ENABLE_NVIDIA_API
::op::gemm::nvidia::set_tf32_enabled(true);
#endif
}
};
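
// Usage sketch (editor's addition, not part of this commit): the guard is
// scope-bound, so TF32 is restored on every exit path, including exceptions.
inline void scoped_tf32_example() {
    ScopedTF32Disable tf32_guard; // TF32 off from here on
    // ... run GEMMs that need full FP32 precision ...
}                                 // destructor re-enables TF32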

inline bool is_gemm_compatible_3d(const Tensor &t) {
    if (t->ndim() != 3) {
        return false;
    }
    const auto batch = t->shape()[0];
    const auto rows = t->shape()[1];
    const auto cols = t->shape()[2];
    const auto bs = t->stride(0);
    const auto rs = t->stride(1);
    const auto cs = t->stride(2);
    if (rs != 1 && cs != 1) {
        return false;
    }
    if (cs == 1) {
        if (rs < static_cast<int64_t>(cols)) {
            return false;
        }
    } else {
        if (cs < static_cast<int64_t>(rows)) {
            return false;
        }
    }
    if (batch > 1 && bs == 0) {
        return false;
    }
    return true;
}

inline Tensor ensure_gemm_compatible(const Tensor &t) {
    if (t->ndim() == 2) {
        return t->is_contiguous() ? t : rearrange(t);
    } else if (t->ndim() == 3) {
        return is_gemm_compatible_3d(t) ? t : rearrange(t);
    }
    return t->is_contiguous() ? t : rearrange(t);
}

} // anonymous namespace

Tensor bilinear(Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
    ScopedTF32Disable tf32_guard;
    const size_t batch_size = x1->shape()[0];
    const size_t in1_features = x1->shape()[1];
    const size_t in2_features = x2->shape()[1];
    const size_t out_features = weight->shape()[0];
    Tensor x1_compat = ensure_gemm_compatible(x1);
    Tensor x2_compat = ensure_gemm_compatible(x2);
    Tensor weight_cont = weight->is_contiguous() ? weight : weight->contiguous();
    Tensor weight_permuted = weight_cont->permute({1, 0, 2});
    Tensor weight_permuted_cont = weight_permuted->is_contiguous()
                                    ? weight_permuted
                                    : weight_permuted->contiguous();
    Tensor weight_matrix = weight_permuted_cont->view({in1_features, out_features * in2_features});
    Tensor intermediate = matmul(x1_compat, weight_matrix, 1.0f);
    Tensor intermediate_3d = intermediate->view({batch_size, out_features, in2_features});
    Tensor intermediate_transposed = intermediate_3d->permute({0, 2, 1});
    Tensor intermediate_compat = ensure_gemm_compatible(intermediate_transposed);
    Tensor x2_row = x2_compat->view({batch_size, 1, in2_features});
    Tensor x2_row_compat = ensure_gemm_compatible(x2_row);
    Tensor out_3d = matmul(x2_row_compat, intermediate_compat, 1.0f);
    Tensor out = out_3d->view({batch_size, out_features});
    if (bias) {
        Tensor bias_broadcast = (*bias)->as_strided(
            {batch_size, out_features},
            {0, (*bias)->strides()[0]});
        out = add(out, bias_broadcast);
    }
    return out;
}

void bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
    Tensor result = bilinear(x1, x2, weight, bias);
    rearrange_(out, result);
}

} // namespace infinicore::op
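
// Usage sketch (editor's addition, not part of this commit): bilinear follows
// the torch.nn.functional.bilinear convention,
// out[n][o] = x1[n]^T @ weight[o] @ x2[n] + bias[o],
// with x1: {N, F1}, x2: {N, F2}, weight: {O, F1, F2}, bias: {O}.
namespace infinicore::op {
inline Tensor bilinear_example(Tensor x1, Tensor x2, Tensor weight, Tensor bias) {
    return bilinear(x1, x2, weight, bias); // Tensor converts to std::optional<Tensor>
}
} // namespace infinicore::op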
#include "infinicore/ops/cross_entropy.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<CrossEntropy::schema> &CrossEntropy::dispatcher() {
static common::OpDispatcher<CrossEntropy::schema> dispatcher_;
return dispatcher_;
};
void CrossEntropy::execute(Tensor output, Tensor input, Tensor target) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(input, target);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No CrossEntropy implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input, target);
}
Tensor cross_entropy(Tensor input, Tensor target) {
Shape shape = target->shape();
auto output = Tensor::empty(shape, input->dtype(), input->device());
cross_entropy_(output, input, target);
return output;
}
void cross_entropy_(Tensor output, Tensor input, Tensor target) {
CrossEntropy::execute(output, input, target);
}
} // namespace infinicore::op
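
// Usage sketch (editor's addition, not part of this commit): per the binding
// docstring later in this commit, this is token-wise cross entropy without
// reduction, so the loss tensor takes the target's shape; shapes here are
// illustrative.
namespace infinicore::op {
inline Tensor cross_entropy_example(Tensor logits /* {N, C} */, Tensor target /* {N} */) {
    return cross_entropy(logits, target); // loss: {N}, one value per token
}
} // namespace infinicore::op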
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/cross_entropy.hpp"
#include <infiniop.h>
namespace infinicore::op::cross_entropy_impl::infiniop {
thread_local common::OpCache<size_t, infiniopCrossEntropyDescriptor_t> caches(
100,
[](infiniopCrossEntropyDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyCrossEntropyDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor output, Tensor input, Tensor target) {
size_t seed = hash_combine(output, input, target);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopCrossEntropyDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateCrossEntropyDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc(),
target->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopCrossEntropy(
desc,
workspace->data(),
workspace_size,
output->data(),
input->data(),
target->data(),
context::getStream()));
}
static bool registered = []() {
CrossEntropy::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::cross_entropy_impl::infiniop
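
// Pattern note (editor's addition, not part of this commit): every infiniop
// backend in this commit follows the same recipe, sketched here in pseudocode:
//
//   size_t key = hash_combine(out, inputs...);      // key from tensor metadata
//   desc = cache hit ? cached : create-and-cache;   // thread-local, per device
//   infiniopGetXxxWorkspaceSize(desc, &bytes);
//   infiniopXxx(desc, workspace, bytes, ..., context::getStream());
//
// Descriptors are destroyed by the eviction callback passed to OpCache above.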
#include "infinicore/ops/equal.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<Equal::schema> &Equal::dispatcher() {
static common::OpDispatcher<Equal::schema> dispatcher_;
return dispatcher_;
};
void Equal::execute(Tensor out, Tensor a, Tensor b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b);
infinicore::context::setDevice(out->device());
dispatcher().lookup(out->device().getType())(out, a, b);
}
Tensor equal(Tensor a, Tensor b) {
auto out = Tensor::empty(a->shape(), DataType::BOOL, a->device());
equal_(out, a, b);
return out;
}
void equal_(Tensor out, Tensor a, Tensor b) {
if (out->dtype() != DataType::BOOL) {
throw std::runtime_error("Equal expects bool output tensor.");
}
Equal::execute(out, a, b);
}
} // namespace infinicore::op
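
// Usage sketch (editor's addition, not part of this commit): the in-place
// variant insists on a DataType::BOOL output, as enforced above.
namespace infinicore::op {
inline Tensor equal_example(Tensor a, Tensor b) {
    auto out = Tensor::empty(a->shape(), DataType::BOOL, a->device());
    equal_(out, a, b); // true where a == b elementwise
    return out;
}
} // namespace infinicore::op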
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/equal.hpp"
#include <infiniop.h>
namespace infinicore::op::equal_impl::infiniop {
thread_local common::OpCache<size_t, infiniopEqualDescriptor_t> caches(
100,
[](infiniopEqualDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyEqualDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor out, Tensor a, Tensor b) {
size_t seed = hash_combine(out, a, b);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
infiniopEqualDescriptor_t desc = nullptr;
if (auto cached = cache.get(seed)) {
desc = *cached;
} else {
INFINICORE_CHECK_ERROR(infiniopCreateEqualDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetEqualWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopEqual(
desc,
workspace_ptr,
workspace_size,
out->data(),
a->data(),
b->data(),
context::getStream()));
}
static bool registered = []() {
Equal::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::equal_impl::infiniop
#include "infinicore/ops/fmod.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<Fmod::schema> &Fmod::dispatcher() {
static common::OpDispatcher<Fmod::schema> dispatcher_;
return dispatcher_;
};
void Fmod::execute(Tensor c, Tensor a, Tensor b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device());
dispatcher().lookup(c->device().getType())(c, a, b);
}
Tensor fmod(Tensor a, Tensor b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
fmod_(c, a, b);
return c;
}
void fmod_(Tensor c, Tensor a, Tensor b) {
Fmod::execute(c, a, b);
}
} // namespace infinicore::op
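
// Usage sketch (editor's addition, not part of this commit): the name
// suggests C's fmod semantics (the result takes the dividend's sign), but
// this file only defines dispatch, so treat that as an assumption.
namespace infinicore::op {
inline Tensor fmod_example(Tensor a, Tensor b) {
    return fmod(a, b); // elementwise remainder; same shape and dtype as a
}
} // namespace infinicore::op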
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/fmod.hpp"
#include <infiniop.h>
namespace infinicore::op::fmod_impl::infiniop {
thread_local common::OpCache<size_t, infiniopFmodDescriptor_t> caches(
100, // capacity
[](infiniopFmodDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyFmodDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor c, Tensor a, Tensor b) {
size_t seed = hash_combine(c, b, a);
auto device_type = context::getDevice().getType();
auto device_index = context::getDevice().getIndex();
auto &cache = caches.getCache(device_type, device_index);
auto desc_opt = cache.get(seed);
infiniopFmodDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateFmodDescriptor(
context::getInfiniopHandle(c->device()), &desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetFmodWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopFmod(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), context::getStream()));
}
static bool registered = []() {
Fmod::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::fmod_impl::infiniop
#include "infinicore/ops/hardswish.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<Hardswish::schema> &Hardswish::dispatcher() {
static common::OpDispatcher<Hardswish::schema> dispatcher_;
return dispatcher_;
}
void Hardswish::execute(Tensor output, Tensor input) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error(
"No Hardswish implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input);
}
Tensor hardswish(Tensor input) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
hardswish_(output, input);
return output;
}
void hardswish_(Tensor output, Tensor input) {
Hardswish::execute(output, input);
}
} // namespace infinicore::op
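
// Reference sketch (editor's addition, not part of this commit): the standard
// hardswish, as in torch.nn.functional.hardswish, is x * relu6(x + 3) / 6.
// A scalar reference for spot-checking a backend elementwise:
inline float hardswish_reference(float x) {
    const float t = x + 3.0f;
    const float relu6 = t < 0.0f ? 0.0f : (t > 6.0f ? 6.0f : t);
    return x * relu6 / 6.0f;
}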
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/hardswish.hpp"
#include <infiniop.h>
namespace infinicore::op::hardswish_impl::infiniop {
thread_local common::OpCache<size_t, infiniopHardSwishDescriptor_t> caches(
100,
[](infiniopHardSwishDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyHardSwishDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor output, Tensor input) {
size_t seed = hash_combine(output, input);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopHardSwishDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateHardSwishDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetHardSwishWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopHardSwish(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
context::getStream()));
}
static bool registered = []() {
Hardswish::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::hardswish_impl::infiniop
#include "infinicore/ops/hardtanh.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<HardTanh::schema> &HardTanh::dispatcher() {
static common::OpDispatcher<HardTanh::schema> dispatcher_;
return dispatcher_;
}
void HardTanh::execute(Tensor output, Tensor input, float min_val, float max_val) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error(
"No HardTanh implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input, min_val, max_val);
}
Tensor hardtanh(Tensor input, float min_val, float max_val) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
hardtanh_(output, input, min_val, max_val);
return output;
}
void hardtanh_(Tensor output, Tensor input, float min_val, float max_val) {
HardTanh::execute(output, input, min_val, max_val);
}
} // namespace infinicore::op
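
// Reference sketch (editor's addition, not part of this commit): hardtanh, as
// in torch.nn.functional.hardtanh, is an elementwise clamp to [min_val, max_val]
// (min_val = -1, max_val = 1 gives the classic tanh approximation).
inline float hardtanh_reference(float x, float min_val, float max_val) {
    return x < min_val ? min_val : (x > max_val ? max_val : x);
}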
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/hardtanh.hpp"
#include <infiniop.h>
namespace infinicore::op::hardtanh_impl::infiniop {
thread_local common::OpCache<size_t, infiniopHardTanhDescriptor_t> caches(
100,
[](infiniopHardTanhDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyHardTanhDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor output, Tensor input, float min_val, float max_val) {
size_t seed = hash_combine(output, input, min_val, max_val);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopHardTanhDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateHardTanhDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc(),
min_val,
max_val));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetHardTanhWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopHardTanh(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
context::getStream()));
}
static bool registered = []() {
HardTanh::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::hardtanh_impl::infiniop

@@ -2,23 +2,13 @@
#include <pybind11/pybind11.h>
#include "ops/adaptive_max_pool1d.hpp"
#include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/all.hpp"
#include "ops/asinh.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/baddbmm.hpp"
#include "ops/bilinear.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/embedding.hpp"
#include "ops/equal.hpp"
#include "ops/flash_attention.hpp"
#include "ops/fmod.hpp"
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/kv_caching.hpp"
#include "ops/linear.hpp"
#include "ops/linear_w8a8i8.hpp"
@@ -45,39 +35,28 @@ namespace py = pybind11;

namespace infinicore::ops {

inline void bind(py::module &m) {
    bind_adaptive_max_pool1d(m);
    bind_add(m);
    bind_add_rms_norm(m);
    bind_attention(m);
    bind_asinh(m);
    bind_baddbmm(m);
    bind_bilinear(m);
    bind_causal_softmax(m);
    bind_flash_attention(m);
    bind_kv_caching(m);
    bind_fmod(m);
    bind_random_sample(m);
    bind_linear(m);
    bind_matmul(m);
    bind_mul(m);
    bind_mha_varlen(m);
    bind_hardswish(m);
    bind_hardtanh(m);
    bind_paged_attention(m);
    bind_paged_attention_prefill(m);
    bind_paged_caching(m);
    bind_random_sample(m);
    bind_cross_entropy(m);
    bind_rearrange(m);
    bind_rms_norm(m);
    bind_avg_pool1d(m);
    bind_silu(m);
    bind_swiglu(m);
    bind_rope(m);
    bind_embedding(m);
    bind_linear_w8a8i8(m);
    bind_silu_and_mul(m);
    bind_equal(m);
    bind_sum(m);
    bind_var_mean(m);
    bind_var(m);
......

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/adaptive_max_pool1d.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_adaptive_max_pool1d(py::module &m) {
    m.def("adaptive_max_pool1d",
          &op::adaptive_max_pool1d,
          py::arg("x"),
          py::arg("output_size"),
          R"doc(1D Adaptive Max Pooling.

Args:
    x: Input tensor of shape (N, C, L_in) or (N, L_in)
    output_size: Target output size L_out

Returns:
    Output tensor of shape (N, C, L_out) or (N, L_out)
)doc");
    m.def("adaptive_max_pool1d_",
          &op::adaptive_max_pool1d_,
          py::arg("y"),
          py::arg("x"),
          py::arg("output_size"),
          R"doc(In-place 1D Adaptive Max Pooling.

Args:
    y: Output tensor of shape (N, C, L_out) or (N, L_out)
    x: Input tensor of shape (N, C, L_in) or (N, L_in)
    output_size: Target output size L_out
)doc");
}

} // namespace infinicore::ops

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/asinh.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_asinh(py::module &m) {
    m.def("asinh",
          &op::asinh,
          py::arg("x"),
          R"doc(Element-wise inverse hyperbolic sine function.)doc");
    m.def("asinh_",
          &op::asinh_,
          py::arg("y"),
          py::arg("x"),
          R"doc(In-place element-wise inverse hyperbolic sine function.)doc");
}

} // namespace infinicore::ops

#pragma once

#include <optional>

#include <pybind11/pybind11.h>

#include "infinicore/ops/avg_pool1d.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_avg_pool1d(py::module &m) {
    m.def(
        "avg_pool1d",
        [](::infinicore::Tensor input, size_t kernel_size, std::optional<size_t> stride, size_t padding) {
            return op::avg_pool1d(input, kernel_size, stride.value_or(0), padding);
        },
        py::arg("input"),
        py::arg("kernel_size"),
        py::arg("stride") = py::none(),
        py::arg("padding") = 0,
        R"doc(AvgPool1d out-of-place.)doc");
    m.def(
        "avg_pool1d_",
        [](::infinicore::Tensor output, ::infinicore::Tensor input, size_t kernel_size, std::optional<size_t> stride, size_t padding) {
            op::avg_pool1d_(output, input, kernel_size, stride.value_or(0), padding);
        },
        py::arg("output"),
        py::arg("input"),
        py::arg("kernel_size"),
        py::arg("stride") = py::none(),
        py::arg("padding") = 0,
        R"doc(AvgPool1d in-place variant writing to provided output tensor.)doc");
}

} // namespace infinicore::ops

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/baddbmm.hpp"

namespace py = pybind11;

namespace infinicore::ops {

// `inline` keeps these header-defined wrappers ODR-safe when the header is
// included from more than one translation unit.
inline Tensor py_baddbmm(Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
    return op::baddbmm(input, batch1, batch2, beta, alpha);
}

inline void py_baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
    op::baddbmm_(out, input, batch1, batch2, beta, alpha);
}

inline void bind_baddbmm(py::module &m) {
    m.def("baddbmm",
          &py_baddbmm,
          py::arg("input"),
          py::arg("batch1"),
          py::arg("batch2"),
          py::arg("beta") = 1.0f,
          py::arg("alpha") = 1.0f,
          R"doc(Batched matrix-matrix product with addition.

Args:
    input: Input tensor
    batch1: First batch of matrices
    batch2: Second batch of matrices
    beta: Scaling factor for input tensor
    alpha: Scaling factor for the product of batch1 and batch2

Returns:
    Output tensor after baddbmm operation
)doc");
    m.def("baddbmm_",
          &py_baddbmm_,
          py::arg("out"),
          py::arg("input"),
          py::arg("batch1"),
          py::arg("batch2"),
          py::arg("beta") = 1.0f,
          py::arg("alpha") = 1.0f,
          R"doc(In-place batched matrix-matrix product with addition.

Args:
    out: Output tensor
    input: Input tensor
    batch1: First batch of matrices
    batch2: Second batch of matrices
    beta: Scaling factor for input tensor
    alpha: Scaling factor for the product of batch1 and batch2
)doc");
}

} // namespace infinicore::ops

#pragma once

#include <optional>

#include <pybind11/pybind11.h>

#include "infinicore/ops/bilinear.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline Tensor py_bilinear(Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
    std::optional<Tensor> bias_tensor = std::nullopt;
    if (!bias.is_none()) {
        bias_tensor = bias.cast<Tensor>();
    }
    return op::bilinear(x1, x2, weight, bias_tensor);
}

inline void py_bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
    std::optional<Tensor> bias_tensor = std::nullopt;
    if (!bias.is_none()) {
        bias_tensor = bias.cast<Tensor>();
    }
    op::bilinear_(out, x1, x2, weight, bias_tensor);
}

inline void bind_bilinear(py::module &m) {
    m.def("bilinear",
          &py_bilinear,
          py::arg("x1"),
          py::arg("x2"),
          py::arg("weight"),
          py::arg("bias"),
          R"doc(Bilinear transformation of two input tensors.

Args:
    x1: First input tensor
    x2: Second input tensor
    weight: Weight tensor
    bias: Bias tensor (optional)

Returns:
    Output tensor after bilinear transformation
)doc");
    m.def("bilinear_",
          &py_bilinear_,
          py::arg("out"),
          py::arg("x1"),
          py::arg("x2"),
          py::arg("weight"),
          py::arg("bias"),
          R"doc(In-place bilinear transformation of two input tensors.

Args:
    out: Output tensor
    x1: First input tensor
    x2: Second input tensor
    weight: Weight tensor
    bias: Bias tensor (optional)
)doc");
}

} // namespace infinicore::ops

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/cross_entropy.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_cross_entropy(py::module &m) {
    m.def("cross_entropy",
          &op::cross_entropy,
          py::arg("logits"),
          py::arg("target"),
          R"doc(Token-wise cross entropy loss without reduction.)doc");
    m.def("cross_entropy_",
          &op::cross_entropy_,
          py::arg("loss"),
          py::arg("logits"),
          py::arg("target"),
          R"doc(Write cross entropy loss into a provided tensor.)doc");
}

} // namespace infinicore::ops

#pragma once

#include <pybind11/pybind11.h>

#include "infinicore/ops/equal.hpp"

namespace py = pybind11;

namespace infinicore::ops {

inline void bind_equal(py::module &m) {
    m.def("equal",
          &op::equal,
          py::arg("a"),
          py::arg("b"),
          R"doc(Elementwise equality returning a bool tensor.)doc");
    m.def("equal_",
          &op::equal_,
          py::arg("out"),
          py::arg("a"),
          py::arg("b"),
          R"doc(In-place elementwise equality writing into `out`.)doc");
}

} // namespace infinicore::ops
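
// Usage sketch (editor's addition, not part of this commit): each bind_xxx
// header defines an inline registrar; a pybind11 module entry point composes
// them through the aggregate infinicore::ops::bind(m) shown earlier. The
// module name below is illustrative, not taken from this repository.
PYBIND11_MODULE(example_ops, m) {
    infinicore::ops::bind(m);
}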