Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
b2660e66
Unverified
Commit
b2660e66
authored
Mar 11, 2026
by
thatPepe
Committed by
GitHub
Mar 11, 2026
Browse files
Merge pull request #1070 from InfiniTensor/issue/1031_revert
Issue/1031 revert T1-1-9
parents
037140c0
45a3794b
Changes
71
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1 addition
and
1127 deletions
+1
-1127
src/infinicore/ops/asinh/asinh_infiniop.cc
src/infinicore/ops/asinh/asinh_infiniop.cc
+0
-52
src/infinicore/ops/baddbmm/baddbmm.cc
src/infinicore/ops/baddbmm/baddbmm.cc
+0
-100
src/infinicore/ops/bilinear/bilinear.cc
src/infinicore/ops/bilinear/bilinear.cc
+0
-119
src/infinicore/ops/fmod/fmod.cc
src/infinicore/ops/fmod/fmod.cc
+0
-28
src/infinicore/ops/fmod/fmod_infiniop.cc
src/infinicore/ops/fmod/fmod_infiniop.cc
+0
-52
src/infinicore/pybind11/ops.hpp
src/infinicore/pybind11/ops.hpp
+1
-12
src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp
src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp
+0
-39
src/infinicore/pybind11/ops/asinh.hpp
src/infinicore/pybind11/ops/asinh.hpp
+0
-24
src/infinicore/pybind11/ops/baddbmm.hpp
src/infinicore/pybind11/ops/baddbmm.hpp
+0
-56
src/infinicore/pybind11/ops/bilinear.hpp
src/infinicore/pybind11/ops/bilinear.hpp
+0
-61
src/infinicore/pybind11/ops/fmod.hpp
src/infinicore/pybind11/ops/fmod.hpp
+0
-26
src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h
src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h
+0
-47
src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc
...op/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc
+0
-94
src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h
...iop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h
+0
-8
src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh
src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh
+0
-54
src/infiniop/ops/adaptive_max_pool1d/info.h
src/infiniop/ops/adaptive_max_pool1d/info.h
+0
-65
src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh
...s/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh
+0
-8
src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca
.../adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca
+0
-130
src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h
...ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h
+0
-8
src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu
...ps/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu
+0
-144
No files found.
src/infinicore/ops/asinh/asinh_infiniop.cc
deleted
100644 → 0
View file @
037140c0
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/asinh.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::asinh_impl::infiniop {

// Thread-local cache of infiniop asinh descriptors, keyed by a hash of the
// (y, x) tensor pair. The eviction callback destroys the underlying infiniop
// descriptor so cached handles are never leaked.
thread_local common::OpCache<size_t, infiniopAsinhDescriptor_t> caches(
    100, // capacity
    [](infiniopAsinhDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyAsinhDescriptor(desc));
            desc = nullptr; // guard against double-destroy on re-entry
        }
    });

// Computes y = asinh(x) element-wise through the infiniop backend.
// A descriptor matching this (y, x) configuration is fetched from the
// per-device cache, or created and cached on a miss.
void calculate(Tensor y, Tensor x) {
    size_t seed = hash_combine(y, x);
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);
    auto desc_opt = cache.get(seed);
    infiniopAsinhDescriptor_t desc = nullptr;
    if (!desc_opt) {
        INFINICORE_CHECK_ERROR(infiniopCreateAsinhDescriptor(
            context::getInfiniopHandle(y->device()),
            &desc,
            y->desc(),
            x->desc()));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    // Query and allocate the scratch workspace this descriptor requires.
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetAsinhWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopAsinh(
        desc,
        workspace->data(),
        workspace_size,
        y->data(),
        x->data(),
        context::getStream()));
}

// Self-registration at static-initialization time: installs `calculate` for
// all device types. NOTE(review): the `false` flag presumably means "do not
// override existing registrations" — confirm against OpDispatcher::registerAll.
static bool registered = []() {
    Asinh::dispatcher().registerAll(&calculate, false);
    return true;
}();

} // namespace infinicore::op::asinh_impl::infiniop
src/infinicore/ops/baddbmm/baddbmm.cc
deleted
100644 → 0
View file @
037140c0
#include "infinicore/ops/baddbmm.hpp"
#include "infinicore/ops/gemm.hpp"
#include "infinicore/ops/rearrange.hpp"
namespace infinicore::op {

// Inline BLAS-compatibility check to reduce function-call overhead
// (comment translated from Chinese). A tensor is usable by the GEMM
// backend when its innermost matrix has at least one unit stride and,
// for 3-D inputs, a non-broadcast batch stride.
inline bool is_blas_compatible(const Tensor &t) {
    const auto ndim = t->ndim();
    if (ndim == 2) {
        const auto rs = t->stride(0);
        const auto cs = t->stride(1);
        // Neither rows nor columns contiguous: not BLAS-addressable.
        if (rs != 1 && cs != 1) {
            return false;
        }
        // Both strides 1 is only coherent for a degenerate (1 x n / n x 1) matrix.
        if (rs == 1 && cs == 1) {
            return t->shape()[0] == 1 || t->shape()[1] == 1;
        }
        return true;
    } else if (ndim == 3) {
        const auto rs = t->stride(1);
        const auto cs = t->stride(2);
        // Broadcast batch dimension (stride 0 with batch > 1) is rejected.
        if (t->shape()[0] > 1 && t->stride(0) == 0) {
            return false;
        }
        if (rs != 1 && cs != 1) {
            return false;
        }
        if (rs == 1 && cs == 1) {
            return t->shape()[1] == 1 || t->shape()[2] == 1;
        }
        return true;
    }
    return false;
}

// Copies `input` into `output` (shape {batch_size, m, n}), broadcasting a 2-D
// input or a 3-D input with a singleton batch across the batch dimension via a
// zero batch stride.
inline void prepare_gemm_input(Tensor &output, Tensor &input, const size_t batch_size, const size_t m, const size_t n) {
    const auto input_ndim = input->ndim();
    if (input_ndim == 2) {
        rearrange_(output, input->as_strided({batch_size, m, n}, {0, input->stride(0), input->stride(1)}));
    } else if (input_ndim == 3 && input->shape()[0] == 1 && batch_size > 1) {
        rearrange_(output, input->as_strided({batch_size, m, n}, {0, input->stride(1), input->stride(2)}));
    } else {
        rearrange_(output, input);
    }
}

// out = beta * input + alpha * (batch1 @ batch2), PyTorch-style baddbmm.
// batch1 is expected as {B, m, k} and batch2 as {B, k, n} — shapes are read
// from those positions but not validated here.
Tensor baddbmm(Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha) {
    const size_t batch_size = batch1->shape()[0];
    const size_t m = batch1->shape()[1];
    const size_t n = batch2->shape()[2];
    // Repack operands only when their layout is not directly GEMM-usable.
    const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
    const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
    // beta == 0: the input term vanishes, so a plain GEMM suffices.
    if (beta == 0.0f) {
        return gemm(a, b, alpha, 0.0f);
    }
    Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
    // Seed the accumulator with (broadcast) input, then fused multiply-add.
    prepare_gemm_input(result, input, batch_size, m, n);
    gemm_(result, a, b, alpha, beta);
    return result;
}

// In-place variant: writes beta * input + alpha * (batch1 @ batch2) into `out`.
void baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, float beta, float alpha) {
    const size_t batch_size = batch1->shape()[0];
    const size_t m = batch1->shape()[1];
    const size_t n = batch2->shape()[2];
    const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
    const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
    // `out` can serve directly as the GEMM accumulator only when it is a
    // contiguous {B, m, n} tensor.
    const bool out_is_usable = out->is_contiguous()
        && out->ndim() == 3
        && out->shape()[0] == batch_size
        && out->shape()[1] == m
        && out->shape()[2] == n;
    if (out_is_usable) {
        // Skip the copy when `out` already aliases `input`'s storage.
        if (beta != 0.0f && input->data() != out->data()) {
            prepare_gemm_input(out, input, batch_size, m, n);
        }
        gemm_(out, a, b, alpha, beta);
    } else {
        // Compute into a contiguous scratch tensor, then scatter into `out`.
        Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
        if (beta != 0.0f) {
            prepare_gemm_input(result, input, batch_size, m, n);
        }
        gemm_(result, a, b, alpha, beta);
        rearrange_(out, result);
    }
}

} // namespace infinicore::op
src/infinicore/ops/bilinear/bilinear.cc
deleted
100644 → 0
View file @
037140c0
#include "infinicore/ops/bilinear.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/matmul.hpp"
#include "infinicore/ops/rearrange.hpp"
#ifdef ENABLE_NVIDIA_API
namespace
op
::
gemm
::
nvidia
{
void
set_tf32_enabled
(
bool
);
}
#endif
namespace infinicore::op {
namespace {

// RAII guard: disables TF32 within the enclosing scope (translated from
// Chinese). Construction turns TF32 off, destruction turns it back on.
struct ScopedTF32Disable {
    ScopedTF32Disable() {
#ifdef ENABLE_NVIDIA_API
        // In a real project, add a check so this is only called on NVIDIA
        // devices (translated from Chinese).
        // `::op` forces lookup from the global namespace to avoid being
        // shadowed by the current infinicore::op (translated from Chinese).
        ::op::gemm::nvidia::set_tf32_enabled(false);
#endif
    }
    ~ScopedTF32Disable() {
#ifdef ENABLE_NVIDIA_API
        ::op::gemm::nvidia::set_tf32_enabled(true);
#endif
    }
};

// True when a 3-D tensor's layout is directly consumable by the GEMM backend:
// one unit stride in the matrix, a leading dimension at least as large as the
// matrix extent, and no broadcast batch stride.
inline bool is_gemm_compatible_3d(const Tensor &t) {
    if (t->ndim() != 3) {
        return false;
    }
    const auto batch = t->shape()[0];
    const auto rows = t->shape()[1];
    const auto cols = t->shape()[2];
    const auto bs = t->stride(0);
    const auto rs = t->stride(1);
    const auto cs = t->stride(2);
    if (rs != 1 && cs != 1) {
        return false;
    }
    if (cs == 1) {
        // Row-major: row stride is the leading dimension, must cover a row.
        if (rs < static_cast<int64_t>(cols)) {
            return false;
        }
    } else {
        // Column-major: column stride must cover a column.
        if (cs < static_cast<int64_t>(rows)) {
            return false;
        }
    }
    if (batch > 1 && bs == 0) {
        return false;
    }
    return true;
}

// Returns `t` unchanged when its layout already suits GEMM, otherwise a
// repacked copy.
inline Tensor ensure_gemm_compatible(const Tensor &t) {
    if (t->ndim() == 2) {
        return t->is_contiguous() ? t : rearrange(t);
    } else if (t->ndim() == 3) {
        return is_gemm_compatible_3d(t) ? t : rearrange(t);
    }
    return t->is_contiguous() ? t : rearrange(t);
}

} // anonymous namespace

// Bilinear transform: out[b, o] = x1[b, :] . weight[o, :, :] . x2[b, :] (+ bias),
// implemented as two matmuls with TF32 disabled for the duration.
Tensor bilinear(Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
    ScopedTF32Disable tf32_guard;
    const size_t batch_size = x1->shape()[0];
    const size_t in1_features = x1->shape()[1];
    const size_t in2_features = x2->shape()[1];
    const size_t out_features = weight->shape()[0];
    Tensor x1_compat = ensure_gemm_compatible(x1);
    Tensor x2_compat = ensure_gemm_compatible(x2);
    // Fold weight {out, in1, in2} into a {in1, out*in2} matrix so the first
    // contraction is a single 2-D matmul.
    Tensor weight_cont = weight->is_contiguous() ? weight : weight->contiguous();
    Tensor weight_permuted = weight_cont->permute({1, 0, 2});
    Tensor weight_permuted_cont = weight_permuted->is_contiguous() ? weight_permuted : weight_permuted->contiguous();
    Tensor weight_matrix = weight_permuted_cont->view({in1_features, out_features * in2_features});
    // intermediate[b, o*i2] = sum_i1 x1[b, i1] * W[i1, o*i2]
    Tensor intermediate = matmul(x1_compat, weight_matrix, 1.0f);
    Tensor intermediate_3d = intermediate->view({batch_size, out_features, in2_features});
    Tensor intermediate_transposed = intermediate_3d->permute({0, 2, 1});
    Tensor intermediate_compat = ensure_gemm_compatible(intermediate_transposed);
    // Second contraction: batched (1 x in2) @ (in2 x out) per sample.
    Tensor x2_row = x2_compat->view({batch_size, 1, in2_features});
    Tensor x2_row_compat = ensure_gemm_compatible(x2_row);
    Tensor out_3d = matmul(x2_row_compat, intermediate_compat, 1.0f);
    Tensor out = out_3d->view({batch_size, out_features});
    if (bias) {
        // Broadcast the 1-D bias over the batch via a zero batch stride.
        Tensor bias_broadcast = (*bias)->as_strided({batch_size, out_features}, {0, (*bias)->strides()[0]});
        out = add(out, bias_broadcast);
    }
    return out;
}

// In-place variant: computes bilinear(...) and copies the result into `out`.
void bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
    Tensor result = bilinear(x1, x2, weight, bias);
    rearrange_(out, result);
}

} // namespace infinicore::op
src/infinicore/ops/fmod/fmod.cc
deleted
100644 → 0
View file @
037140c0
#include "infinicore/ops/fmod.hpp"
#include "../../utils.hpp"
namespace infinicore::op {

// Lazily-constructed singleton dispatcher mapping device types to fmod kernels.
common::OpDispatcher<Fmod::schema> &Fmod::dispatcher() {
    static common::OpDispatcher<Fmod::schema> instance;
    return instance;
}

// Routes c = fmod(a, b) to the kernel registered for c's device type.
// All three tensors must live on the same device.
void Fmod::execute(Tensor c, Tensor a, Tensor b) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
    infinicore::context::setDevice(c->device());
    auto kernel = dispatcher().lookup(c->device().getType());
    kernel(c, a, b);
}

// Out-of-place fmod: allocates the result with a's shape/dtype/device and
// delegates to the in-place form.
Tensor fmod(Tensor a, Tensor b) {
    auto result = Tensor::empty(a->shape(), a->dtype(), a->device());
    fmod_(result, a, b);
    return result;
}

// In-place fmod: thin wrapper over Fmod::execute.
void fmod_(Tensor c, Tensor a, Tensor b) {
    Fmod::execute(c, a, b);
}

} // namespace infinicore::op
src/infinicore/ops/fmod/fmod_infiniop.cc
deleted
100644 → 0
View file @
037140c0
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/fmod.hpp"
#include <infiniop.h>
namespace infinicore::op::fmod_impl::infiniop {

// Thread-local cache of infiniop fmod descriptors keyed by a tensor-config
// hash; the eviction callback destroys the underlying descriptor.
thread_local common::OpCache<size_t, infiniopFmodDescriptor_t> caches(
    100, // capacity
    [](infiniopFmodDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyFmodDescriptor(desc));
            desc = nullptr; // guard against double-destroy
        }
    });

// Computes c = fmod(a, b) element-wise through the infiniop backend,
// reusing a cached descriptor for previously-seen tensor configurations.
void calculate(Tensor c, Tensor a, Tensor b) {
    // NOTE(review): the hash is seeded as (c, b, a) while the descriptor is
    // created from (c, a, b); the order only affects the cache key, but the
    // asymmetry looks accidental — confirm it was intended.
    size_t seed = hash_combine(c, b, a);
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);
    auto desc_opt = cache.get(seed);
    infiniopFmodDescriptor_t desc = nullptr;
    if (!desc_opt) {
        INFINICORE_CHECK_ERROR(infiniopCreateFmodDescriptor(
            context::getInfiniopHandle(c->device()),
            &desc,
            c->desc(),
            a->desc(),
            b->desc()));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    // Query and allocate the scratch workspace this descriptor requires.
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetFmodWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopFmod(
        desc,
        workspace->data(),
        workspace_size,
        c->data(),
        a->data(),
        b->data(),
        context::getStream()));
}

// Self-registration: installs `calculate` for all device types at static
// initialization time.
static bool registered = []() {
    Fmod::dispatcher().registerAll(&calculate, false);
    return true;
}();

} // namespace infinicore::op::fmod_impl::infiniop
src/infinicore/pybind11/ops.hpp
View file @
b2660e66
...
...
@@ -2,21 +2,16 @@
#include <pybind11/pybind11.h>
#include "ops/adaptive_max_pool1d.hpp"
#include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/all.hpp"
#include "ops/asinh.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/baddbmm.hpp"
#include "ops/bilinear.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/embedding.hpp"
#include "ops/equal.hpp"
#include "ops/flash_attention.hpp"
#include "ops/fmod.hpp"
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/kv_caching.hpp"
...
...
@@ -45,18 +40,12 @@ namespace py = pybind11;
namespace
infinicore
::
ops
{
inline
void
bind
(
py
::
module
&
m
)
{
bind_adaptive_max_pool1d
(
m
);
bind_add
(
m
);
bind_add_rms_norm
(
m
);
bind_attention
(
m
);
bind_asinh
(
m
);
bind_baddbmm
(
m
);
bind_bilinear
(
m
);
bind_causal_softmax
(
m
);
bind_flash_attention
(
m
);
bind_kv_caching
(
m
);
bind_fmod
(
m
);
bind_random_sample
(
m
);
bind_linear
(
m
);
bind_matmul
(
m
);
bind_mul
(
m
);
...
...
@@ -77,12 +66,12 @@ inline void bind(py::module &m) {
bind_embedding
(
m
);
bind_linear_w8a8i8
(
m
);
bind_silu_and_mul
(
m
);
bind_equal
(
m
);
bind_sum
(
m
);
bind_var_mean
(
m
);
bind_var
(
m
);
bind_topk
(
m
);
bind_all
(
m
);
bind_equal
(
m
);
}
}
// namespace infinicore::ops
src/infinicore/pybind11/ops/adaptive_max_pool1d.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/adaptive_max_pool1d.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Registers the adaptive_max_pool1d bindings (out-of-place and in-place)
// on the given pybind11 module.
inline void bind_adaptive_max_pool1d(py::module &m) {
    m.def("adaptive_max_pool1d",
          &op::adaptive_max_pool1d,
          py::arg("x"),
          py::arg("output_size"),
          R"doc(1D Adaptive Max Pooling.
Args:
x: Input tensor of shape (N, C, L_in) or (N, L_in)
output_size: Target output size L_out
Returns:
Output tensor of shape (N, C, L_out) or (N, L_out)
)doc");
    m.def("adaptive_max_pool1d_",
          &op::adaptive_max_pool1d_,
          py::arg("y"),
          py::arg("x"),
          py::arg("output_size"),
          R"doc(In-place 1D Adaptive Max Pooling.
Args:
y: Output tensor of shape (N, C, L_out) or (N, L_out)
x: Input tensor of shape (N, C, L_in) or (N, L_in)
output_size: Target output size L_out
)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/asinh.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/asinh.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Registers the asinh bindings (out-of-place and in-place) on the module.
inline void bind_asinh(py::module &m) {
    m.def("asinh",
          &op::asinh,
          py::arg("x"),
          R"doc(Element-wise inverse hyperbolic sine function.)doc");
    m.def("asinh_",
          &op::asinh_,
          py::arg("y"),
          py::arg("x"),
          R"doc(In-place element-wise inverse hyperbolic sine function.)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/baddbmm.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/baddbmm.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Wrapper exposing op::baddbmm with Python-style default arguments.
// Marked `inline`: this definition lives in a header, so a non-inline
// function here would violate the One Definition Rule as soon as the
// header is included from more than one translation unit.
inline Tensor py_baddbmm(Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
    return op::baddbmm(input, batch1, batch2, beta, alpha);
}

// In-place wrapper; `inline` for the same ODR reason as above.
inline void py_baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
    op::baddbmm_(out, input, batch1, batch2, beta, alpha);
}

// Registers the baddbmm bindings (out-of-place and in-place) on the module.
inline void bind_baddbmm(py::module &m) {
    m.def("baddbmm",
          &py_baddbmm,
          py::arg("input"),
          py::arg("batch1"),
          py::arg("batch2"),
          py::arg("beta") = 1.0f,
          py::arg("alpha") = 1.0f,
          R"doc(Batched matrix-matrix product with addition.
Args:
input: Input tensor
batch1: First batch of matrices
batch2: Second batch of matrices
beta: Scaling factor for input tensor
alpha: Scaling factor for the product of batch1 and batch2
Returns:
Output tensor after baddbmm operation
)doc");
    m.def("baddbmm_",
          &py_baddbmm_,
          py::arg("out"),
          py::arg("input"),
          py::arg("batch1"),
          py::arg("batch2"),
          py::arg("beta") = 1.0f,
          py::arg("alpha") = 1.0f,
          R"doc(In-place batched matrix-matrix product with addition.
Args:
out: Output tensor
input: Input tensor
batch1: First batch of matrices
batch2: Second batch of matrices
beta: Scaling factor for input tensor
alpha: Scaling factor for the product of batch1 and batch2
)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/bilinear.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/bilinear.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Wrapper converting a Python-side `bias` (Tensor or None) into
// std::optional<Tensor> before calling op::bilinear.
// Marked `inline`: this definition lives in a header, so a non-inline
// function here would violate the One Definition Rule as soon as the
// header is included from more than one translation unit.
inline Tensor py_bilinear(Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
    std::optional<Tensor> bias_tensor = std::nullopt;
    if (!bias.is_none()) {
        bias_tensor = bias.cast<Tensor>();
    }
    return op::bilinear(x1, x2, weight, bias_tensor);
}

// In-place wrapper; `inline` for the same ODR reason as above.
inline void py_bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
    std::optional<Tensor> bias_tensor = std::nullopt;
    if (!bias.is_none()) {
        bias_tensor = bias.cast<Tensor>();
    }
    op::bilinear_(out, x1, x2, weight, bias_tensor);
}

// Registers the bilinear bindings (out-of-place and in-place) on the module.
inline void bind_bilinear(py::module &m) {
    m.def("bilinear",
          &py_bilinear,
          py::arg("x1"),
          py::arg("x2"),
          py::arg("weight"),
          py::arg("bias"),
          R"doc(Bilinear transformation of two input tensors.
Args:
x1: First input tensor
x2: Second input tensor
weight: Weight tensor
bias: Bias tensor (optional)
Returns:
Output tensor after bilinear transformation
)doc");
    m.def("bilinear_",
          &py_bilinear_,
          py::arg("out"),
          py::arg("x1"),
          py::arg("x2"),
          py::arg("weight"),
          py::arg("bias"),
          R"doc(In-place bilinear transformation of two input tensors.
Args:
out: Output tensor
x1: First input tensor
x2: Second input tensor
weight: Weight tensor
bias: Bias tensor (optional)
)doc");
}

} // namespace infinicore::ops
src/infinicore/pybind11/ops/fmod.hpp
deleted
100644 → 0
View file @
037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/fmod.hpp"
namespace
py
=
pybind11
;
namespace infinicore::ops {

// Registers the fmod bindings (out-of-place and in-place) on the module.
inline void bind_fmod(py::module &m) {
    m.def("fmod",
          &op::fmod,
          py::arg("a"),
          py::arg("b"),
          R"doc(Element-wise floating point remainder of division of two tensors.)doc");
    m.def("fmod_",
          &op::fmod_,
          py::arg("c"),
          py::arg("a"),
          py::arg("b"),
          R"doc(In-place element-wise floating point remainder of division of two tensors.)doc");
}

} // namespace infinicore::ops
src/infiniop/ops/adaptive_max_pool1d/adaptive_max_pool1d.h
deleted
100644 → 0
View file @
037140c0
#ifndef ADAPTIVE_MAX_POOL1D_H
#define ADAPTIVE_MAX_POOL1D_H
#include "../../operator.h"
#include "info.h"
// Stamps out the per-backend Descriptor class inside
// op::adaptive_max_pool1d::NAMESPACE. Each backend supplies its own Opaque
// state plus create()/calculate() definitions in its .cc/.cu file.
// (No comments inside the macro: a // before a line-continuation backslash
// would swallow the following macro line.)
#define DESCRIPTOR(NAMESPACE) \
    \
    namespace op::adaptive_max_pool1d::NAMESPACE { \
    class Descriptor final : public InfiniopDescriptor { \
        struct Opaque; \
        Opaque *_opaque; \
        AdaptiveMaxPool1dInfo _info; \
        size_t _workspace_size; \
        \
        Descriptor( \
            Opaque *opaque, \
            AdaptiveMaxPool1dInfo info, \
            size_t workspace_size, \
            infiniDevice_t device_type, \
            int device_id) \
            : InfiniopDescriptor{device_type, device_id}, \
              _opaque(opaque), \
              _info(info), \
              _workspace_size(workspace_size) {} \
        \
    public: \
        ~Descriptor(); \
        \
        size_t workspaceSize() const { return _workspace_size; } \
        \
        static infiniStatus_t create( \
            infiniopHandle_t handle, \
            Descriptor **desc_ptr, \
            infiniopTensorDescriptor_t y_desc, \
            infiniopTensorDescriptor_t x_desc, \
            size_t output_size); \
        \
        infiniStatus_t calculate( \
            void *workspace, size_t workspace_size, \
            void *y, \
            const void *x, \
            void *stream) const; \
    }; \
    }
#endif // ADAPTIVE_MAX_POOL1D_H
src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.cc
deleted
100644 → 0
View file @
037140c0
#include "adaptive_max_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::adaptive_max_pool1d::cpu {

Descriptor::~Descriptor() {}

// Validates shapes/dtypes via AdaptiveMaxPool1dInfo and builds a CPU
// descriptor. No opaque state and no workspace are needed on CPU.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t output_size) {
    auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
    CHECK_RESULT(result);
    *desc_ptr = new Descriptor(
        nullptr,
        result.take(),
        0,
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// CPU kernel: adaptive max pooling over the last dimension of a (N, C, L)
// or (N, L) tensor, parallelized over (batch, channel) pairs with OpenMP.
template <typename T>
infiniStatus_t adaptiveMaxPool1d(
    const AdaptiveMaxPool1dInfo *info,
    T *y,
    const T *x) {
    const size_t ndim = info->ndim();
    const size_t batch_size = info->shape[0];
    const size_t channels = ndim > 2 ? info->shape[1] : 1;
    const size_t input_length = info->input_length();
    const size_t output_length = info->output_length();
    // Total number of work blocks (Batch * Channels) — translated from Chinese.
    const ptrdiff_t total_blocks = static_cast<ptrdiff_t>(batch_size * channels);
    const ptrdiff_t x_stride_last = info->x_strides.back();
#pragma omp parallel for
    for (ptrdiff_t block_idx = 0; block_idx < total_blocks; ++block_idx) {
        const size_t i = block_idx / channels; // batch index
        const size_t j = block_idx % channels; // channel index
        const T *x_ptr_base;
        T *y_ptr_base;
        if (ndim > 2) {
            // (N, C, L)
            x_ptr_base = x + i * info->x_strides[0] + j * info->x_strides[1];
            y_ptr_base = y + i * info->y_strides[0] + j * info->y_strides[1];
        } else {
            // (N, L)
            x_ptr_base = x + i * info->x_strides[0];
            y_ptr_base = y + i * info->y_strides[0];
        }
        for (size_t out_idx = 0; out_idx < output_length; ++out_idx) {
            // Adaptive window: [floor(k*L/O), ceil((k+1)*L/O)), computed in
            // integer arithmetic (the second division rounds up).
            size_t start_index = (out_idx * input_length) / output_length;
            size_t end_index = ((out_idx + 1) * input_length + output_length - 1) / output_length;
            start_index = std::max(start_index, size_t(0)); // no-op for unsigned; kept for symmetry
            end_index = std::min(end_index, input_length);
            size_t window_len = end_index - start_index;
            // window_len is unsigned, so this only skips empty windows.
            if (window_len <= 0) {
                continue;
            }
            const T *window_ptr = x_ptr_base + start_index * x_stride_last;
            auto max_val = op::common_cpu::reduce_op::max(window_ptr, window_len, x_stride_last);
            y_ptr_base[out_idx] = utils::cast<T>(max_val);
        }
    }
    return INFINI_STATUS_SUCCESS;
}

// Dispatches on the tensor dtype to the typed kernel. Workspace arguments are
// unused on CPU.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    if (_info.atype == INFINI_DTYPE_F32) {
        return adaptiveMaxPool1d(&_info, (float *)y, (const float *)x);
    } else if (_info.atype == INFINI_DTYPE_F16) {
        return adaptiveMaxPool1d(&_info, (fp16_t *)y, (const fp16_t *)x);
    } else if (_info.atype == INFINI_DTYPE_BF16) {
        return adaptiveMaxPool1d(&_info, (bf16_t *)y, (const bf16_t *)x);
    } else if (_info.atype == INFINI_DTYPE_F64) {
        return adaptiveMaxPool1d(&_info, (double *)y, (const double *)x);
    }
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

} // namespace op::adaptive_max_pool1d::cpu
src/infiniop/ops/adaptive_max_pool1d/cpu/adaptive_max_pool1d_cpu.h
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPTIVE_MAX_POOL1D_CPU_H__
#define __ADAPTIVE_MAX_POOL1D_CPU_H__
#include "../adaptive_max_pool1d.h"
// Declares op::adaptive_max_pool1d::cpu::Descriptor via the shared macro.
DESCRIPTOR(cpu)
#endif
src/infiniop/ops/adaptive_max_pool1d/cuda/kernel.cuh
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#define __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#include <cmath>
#include <limits>
// Device-side worker shared by the CUDA-like backends (metax/moore).
// Each CUDA block handles one (batch, channel) pair; threads stride over the
// output positions in steps of BLOCK_SIZE. Tcompute is the accumulation type
// (e.g. float for half inputs).
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ void adaptiveMaxPool1dBlock(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y_batch,
    ptrdiff_t stride_y_channel,
    const Tdata *__restrict__ x,
    ptrdiff_t stride_x_batch,
    ptrdiff_t stride_x_channel,
    ptrdiff_t stride_x_length,
    size_t channels,
    size_t input_length,
    size_t output_length,
    size_t ndim) {
    size_t block_idx = blockIdx.x;
    size_t batch_idx = block_idx / channels;
    size_t channel_idx = block_idx % channels;
    const Tdata *x_ptr;
    Tdata *y_ptr;
    if (ndim > 2) {
        // (N, C, L) layout: offset by both batch and channel strides.
        x_ptr = x + batch_idx * stride_x_batch + channel_idx * stride_x_channel;
        y_ptr = y + batch_idx * stride_y_batch + channel_idx * stride_y_channel;
    } else {
        // (N, L) layout: channel stride unused.
        x_ptr = x + batch_idx * stride_x_batch;
        y_ptr = y + batch_idx * stride_y_batch;
    }
    for (size_t out_idx = threadIdx.x; out_idx < output_length; out_idx += BLOCK_SIZE) {
        // Adaptive window [floor(k*L/O), ceil((k+1)*L/O)).
        // NOTE(review): computed in single-precision float, unlike the CPU
        // path's integer arithmetic — may round differently for very large
        // lengths; confirm acceptable.
        int start_index = static_cast<int>(floorf((float)out_idx * input_length / output_length));
        int end_index = static_cast<int>(ceilf((float)(out_idx + 1) * input_length / output_length));
        if (end_index <= start_index) {
            continue;
        }
        Tcompute max_val = Tcompute(x_ptr[start_index * stride_x_length]);
        for (int i = start_index + 1; i < end_index; ++i) {
            Tcompute val = Tcompute(x_ptr[i * stride_x_length]);
            max_val = max(max_val, val);
        }
        y_ptr[out_idx] = Tdata(max_val);
    }
}
#endif
src/infiniop/ops/adaptive_max_pool1d/info.h
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPATIVE_MAX_POOL1D_H__
#define __ADAPATIVE_MAX_POOL1D_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::adaptive_max_pool1d {

// Validated, backend-agnostic description of one adaptive_max_pool1d call:
// dtype, output shape, both stride sets, and input/output lengths along the
// pooled (last) dimension. Construct via create().
class AdaptiveMaxPool1dInfo {
    AdaptiveMaxPool1dInfo() = default; // only create() may build an instance

public:
    infiniDtype_t atype;                // element dtype shared by x and y
    std::vector<size_t> shape;          // output (y) shape
    std::vector<ptrdiff_t> y_strides;   // output strides, in elements
    std::vector<ptrdiff_t> x_strides;   // input strides, in elements
    size_t input_size;                  // L_in: last dim of x
    size_t output_size;                 // L_out: last dim of y

    size_t ndim() const {
        return shape.size();
    }
    size_t input_length() const {
        return input_size;
    }
    size_t output_length() const {
        return output_size;
    }

    // Validates the descriptor pair and packs the info.
    // Rules enforced: same dtype (one of F16/BF16/F32/F64), same rank,
    // identical leading dims, and y's last dim equal to output_size.
    static utils::Result<AdaptiveMaxPool1dInfo> create(
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc,
        size_t output_size) {
        auto atype = y_desc->dtype();
        if (x_desc->dtype() != atype) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        if (atype != INFINI_DTYPE_F16 && atype != INFINI_DTYPE_BF16
            && atype != INFINI_DTYPE_F32 && atype != INFINI_DTYPE_F64) {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
        const size_t y_ndim = y_desc->ndim();
        const size_t x_ndim = x_desc->ndim();
        if (y_ndim != x_ndim) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        // All dims except the pooled last one must match between x and y.
        for (size_t i = 0; i < y_ndim - 1; ++i) {
            if (x_desc->dim(i) != y_desc->dim(i)) {
                return INFINI_STATUS_BAD_TENSOR_SHAPE;
            }
        }
        if (y_desc->dim(y_ndim - 1) != output_size) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        return utils::Result<AdaptiveMaxPool1dInfo>(AdaptiveMaxPool1dInfo{
            atype,
            y_desc->shape(),
            y_desc->strides(),
            x_desc->strides(),
            x_desc->dim(x_ndim - 1),
            output_size});
    }
};

} // namespace op::adaptive_max_pool1d
#endif // __ADAPATIVE_MAX_POOL1D_H__
src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.cuh
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#define __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#include "../adaptive_max_pool1d.h"
// Declares op::adaptive_max_pool1d::metax::Descriptor via the shared macro.
DESCRIPTOR(metax)
#endif
src/infiniop/ops/adaptive_max_pool1d/metax/adaptive_max_pool1d_metax.maca
deleted
100644 → 0
View file @
037140c0
#include "../../../devices/metax/metax_common.h"
#include "adaptive_max_pool1d_metax.cuh"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
// Metax kernel entry point: thin wrapper launching the shared CUDA-style
// per-block worker from ../cuda/kernel.cuh.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL adaptiveMaxPool1dKernel(
    Tdata *__restrict__ y,
    ptrdiff_t stride_y_batch,
    ptrdiff_t stride_y_channel,
    const Tdata *__restrict__ x,
    ptrdiff_t stride_x_batch,
    ptrdiff_t stride_x_channel,
    ptrdiff_t stride_x_length,
    size_t channels,
    size_t input_length,
    size_t output_length,
    size_t ndim) {
    adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
        y, stride_y_batch, stride_y_channel,
        x, stride_x_batch, stride_x_channel, stride_x_length,
        channels, input_length, output_length,ndim);
}
namespace op::adaptive_max_pool1d::metax {

// Backend-private state: shared handle internals for device queries.
struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};

Descriptor::~Descriptor(){
    delete _opaque;
}

// Validates shapes/dtypes and builds a metax descriptor holding the device
// handle internals. No extra workspace is required.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t output_size) {
    auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
    CHECK_RESULT(result);
    auto info = result.take();
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        std::move(info),
        0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Launches the typed kernel for the requested dtype; half/bf16 accumulate in
// float, double in double.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    uint32_t numblock,
    void *y, infiniDtype_t dtype,
    ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
    const void *x,
    ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
    size_t channels, size_t input_length, size_t output_length, size_t ndim,
    hcStream_t stream){
#define LAUNCH_KERNEL(Tdata, Tcompute) \
    adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<numblock, BLOCK_SIZE, 0, stream>>> ( \
        reinterpret_cast<Tdata *>(y), \
        stride_y_batch, stride_y_channel, \
        reinterpret_cast<const Tdata *>(x), \
        stride_x_batch, stride_x_channel, stride_x_length, \
        channels, input_length, output_length, ndim)
    if (dtype == INFINI_DTYPE_F16) {
        LAUNCH_KERNEL(half, float);
    } else if (dtype == INFINI_DTYPE_BF16) {
        LAUNCH_KERNEL(__hpcc_bfloat16, float);
    } else if (dtype == INFINI_DTYPE_F32) {
        LAUNCH_KERNEL(float, float);
    } else if (dtype == INFINI_DTYPE_F64) {
        LAUNCH_KERNEL(double, double);
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_KERNEL
    return INFINI_STATUS_SUCCESS;
}

// Grid: one block per (batch, channel) pair; block size fixed at 1024 threads.
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y, const void *x,
    void *stream_) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    const size_t ndim = _info.ndim();
    const size_t batch_size = _info.shape[0];
    const size_t channels = ndim > 2 ? _info.shape[1] : 1;
    const size_t input_length = _info.input_length();
    const size_t output_length = _info.output_length();
    ptrdiff_t stride_x_batch = _info.x_strides[0];
    ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
    ptrdiff_t stride_x_length = _info.x_strides.back();
    ptrdiff_t stride_y_batch = _info.y_strides[0];
    ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
    uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
    auto stream = reinterpret_cast<hcStream_t>(stream_);
    if (_opaque->internal->maxThreadsPerBlock() >= METAX_BLOCK_SIZE_1024) {
        CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(
            num_blocks, y, _info.atype,
            stride_y_batch, stride_y_channel,
            x, stride_x_batch, stride_x_channel, stride_x_length,
            channels, input_length, output_length, ndim,
            stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::adaptive_max_pool1d::metax
src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.h
deleted
100644 → 0
View file @
037140c0
#ifndef __ADAPTIVE_MAX_POOL1D_MOORE_H__
#define __ADAPTIVE_MAX_POOL1D_MOORE_H__

#include "../adaptive_max_pool1d.h"

// Declares the Moore-backend Descriptor (namespace
// op::adaptive_max_pool1d::moore) via the shared DESCRIPTOR macro.
// Fix: the include guard was misspelled "MOOORE" (triple O); harmless as a
// guard but wrong per the project's file-based naming convention.
DESCRIPTOR(moore)

#endif // __ADAPTIVE_MAX_POOL1D_MOORE_H__
src/infiniop/ops/adaptive_max_pool1d/moore/adaptive_max_pool1d_moore.mu
deleted
100644 → 0
View file @
037140c0
#include "../../../devices/moore/moore_common.h"
#include "adaptive_max_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
// Moore device-kernel entry point for adaptive max-pool 1D.
// It forwards every argument unchanged to the shared block-level
// implementation (adaptiveMaxPool1dBlock) included from ../cuda/kernel.cuh.
// NOTE(review): the launch code in this file uses one block per
// (batch, channel) pair; presumably adaptiveMaxPool1dBlock relies on that
// mapping -- confirm against kernel.cuh.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim){
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::moore {
// Backend-private descriptor state: holds the Moore device-handle internals
// so they outlive the handle; used at launch time to query device limits
// (maxThreadsPerBlock).
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
// Releases the backend-private Opaque allocated in Descriptor::create.
Descriptor::~Descriptor() {
delete _opaque;
}
// Builds a Moore-backend descriptor for adaptive max-pool 1D.
// Validates the (y, x, output_size) combination via AdaptiveMaxPool1dInfo
// and captures the device-handle internals; this operator needs no extra
// workspace, so the workspace size is recorded as 0.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t output_size) {
    auto info_result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
    CHECK_RESULT(info_result);
    auto moore_handle = reinterpret_cast<device::moore::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{moore_handle->internal()},
        info_result.take(),
        /*workspace_size=*/0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Type-dispatch wrapper for the Moore backend: instantiates and launches
// adaptiveMaxPool1dKernel for the element type named by `dtype`, using
// `num_blocks` blocks of BLOCK_SIZE threads on `musa_stream`.
// Returns INFINI_STATUS_BAD_TENSOR_DTYPE for any dtype other than
// F16 / BF16 / F32 / F64.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    uint32_t num_blocks,
    void *y, infiniDtype_t dtype,
    ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
    const void *x,
    ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
    size_t channels, size_t input_length, size_t output_length, size_t ndim,
    musaStream_t musa_stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute)                                                                  \
    adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>(   \
        reinterpret_cast<Tdata *>(y),                                                                   \
        stride_y_batch, stride_y_channel,                                                               \
        reinterpret_cast<const Tdata *>(x),                                                             \
        stride_x_batch, stride_x_channel, stride_x_length,                                              \
        channels, input_length, output_length, ndim)
    switch (dtype) {
    case INFINI_DTYPE_F16:
        LAUNCH_KERNEL(half, float);
        break;
    case INFINI_DTYPE_BF16:
        LAUNCH_KERNEL(__mt_bfloat16, float);
        break;
    case INFINI_DTYPE_F32:
        LAUNCH_KERNEL(float, float);
        break;
    case INFINI_DTYPE_F64:
        LAUNCH_KERNEL(double, double);
        break;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef LAUNCH_KERNEL
    return INFINI_STATUS_SUCCESS;
}
// Executes the adaptive max-pool 1D forward pass on the Moore device.
// One device block is launched per (batch, channel) pair; the block size is
// the largest the device supports among {2048, 1024, 512}.
//
// Fix: the original dispatch tested `>= MOORE_BLOCK_SIZE_1024`, then
// `>= MOORE_BLOCK_SIZE_512`, then `== MOORE_BLOCK_SIZE_2048` -- making the
// 2048 branch unreachable, since a device with 2048 threads/block also
// satisfies `>= 1024`. Branches are now ordered largest-first so each tier
// is actually selectable.
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *y, const void *x,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    const size_t ndim = _info.ndim();
    const size_t batch_size = _info.shape[0];
    // A rank <= 2 tensor has no explicit channel axis; treat it as 1 channel.
    const size_t channels = ndim > 2 ? _info.shape[1] : 1;
    const size_t input_length = _info.input_length();
    const size_t output_length = _info.output_length();
    const ptrdiff_t stride_x_batch = _info.x_strides[0];
    const ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
    const ptrdiff_t stride_x_length = _info.x_strides.back();
    const ptrdiff_t stride_y_batch = _info.y_strides[0];
    const ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
    const uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
    const auto max_threads = _opaque->internal->maxThreadsPerBlock();
// All three tiers pass identical arguments; only BLOCK_SIZE differs.
#define LAUNCH_WITH(BS)                                           \
    CHECK_STATUS(launchKernel<BS>(                                \
        num_blocks, y, _info.atype,                               \
        stride_y_batch, stride_y_channel,                         \
        x, stride_x_batch, stride_x_channel, stride_x_length,     \
        channels, input_length, output_length, ndim,              \
        musa_stream))
    if (max_threads >= MOORE_BLOCK_SIZE_2048) {
        LAUNCH_WITH(MOORE_BLOCK_SIZE_2048);
    } else if (max_threads >= MOORE_BLOCK_SIZE_1024) {
        LAUNCH_WITH(MOORE_BLOCK_SIZE_1024);
    } else if (max_threads >= MOORE_BLOCK_SIZE_512) {
        LAUNCH_WITH(MOORE_BLOCK_SIZE_512);
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
#undef LAUNCH_WITH
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::moore
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment