Unverified Commit 09d4b2ae authored by thatPepe, committed by GitHub

Merge pull request #1071 from InfiniTensor/issue/1031_T1-1-9

[Operator Competition, Autumn 2025] T1-1-9
parents 5fc85c8b 85f8987c
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/asinh.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::asinh_impl::infiniop {
thread_local common::OpCache<size_t, infiniopAsinhDescriptor_t> caches(
100, // capacity
[](infiniopAsinhDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyAsinhDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor y, Tensor x) {
size_t seed = hash_combine(y, x);
auto device_type = context::getDevice().getType();
auto device_index = context::getDevice().getIndex();
auto &cache = caches.getCache(device_type, device_index);
auto desc_opt = cache.get(seed);
infiniopAsinhDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateAsinhDescriptor(
context::getInfiniopHandle(y->device()), &desc,
y->desc(), x->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetAsinhWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopAsinh(
desc, workspace->data(), workspace_size,
y->data(), x->data(), context::getStream()));
}
static bool registered = []() {
Asinh::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::asinh_impl::infiniop
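// Illustrative sketch (not part of this commit): the backend above memoizes
// infiniop descriptors per device and per tensor-layout hash, so repeated
// calls with identical layouts skip descriptor creation. The same idea in
// plain C++, with hypothetical names (get_or_create is not an InfiniCore API):
#include <cstddef>
#include <unordered_map>
template <typename Desc, typename Make>
Desc &get_or_create(std::unordered_map<std::size_t, Desc> &cache,
                    std::size_t key, Make make) {
    auto it = cache.find(key);
    if (it == cache.end()) {
        it = cache.emplace(key, make()).first; // create once, reuse afterwards
    }
    return it->second;
}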
#include "infinicore/ops/baddbmm.hpp"
#include "infinicore/ops/gemm.hpp"
#include "infinicore/ops/rearrange.hpp"
namespace infinicore::op {
// Inline BLAS compatibility check, avoiding extra function-call overhead
inline bool is_blas_compatible(const Tensor &t) {
const auto ndim = t->ndim();
if (ndim == 2) {
const auto rs = t->stride(0);
const auto cs = t->stride(1);
if (rs != 1 && cs != 1) {
return false;
}
if (rs == 1 && cs == 1) {
return t->shape()[0] == 1 || t->shape()[1] == 1;
}
return true;
} else if (ndim == 3) {
const auto rs = t->stride(1);
const auto cs = t->stride(2);
if (t->shape()[0] > 1 && t->stride(0) == 0) {
return false;
}
if (rs != 1 && cs != 1) {
return false;
}
if (rs == 1 && cs == 1) {
return t->shape()[1] == 1 || t->shape()[2] == 1;
}
return true;
}
return false;
}
inline void prepare_gemm_input(Tensor &output, Tensor &input, const size_t batch_size, const size_t m, const size_t n) {
const auto input_ndim = input->ndim();
if (input_ndim == 2) {
rearrange_(output, input->as_strided(
{batch_size, m, n},
{0, input->stride(0), input->stride(1)}));
} else if (input_ndim == 3 && input->shape()[0] == 1 && batch_size > 1) {
rearrange_(output, input->as_strided(
{batch_size, m, n},
{0, input->stride(1), input->stride(2)}));
} else {
rearrange_(output, input);
}
}
Tensor baddbmm(Tensor input, Tensor batch1, Tensor batch2,
float beta,
float alpha) {
const size_t batch_size = batch1->shape()[0];
const size_t m = batch1->shape()[1];
const size_t n = batch2->shape()[2];
const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
if (beta == 0.0f) {
return gemm(a, b, alpha, 0.0f);
}
Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
prepare_gemm_input(result, input, batch_size, m, n);
gemm_(result, a, b, alpha, beta);
return result;
}
void baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2,
float beta,
float alpha) {
const size_t batch_size = batch1->shape()[0];
const size_t m = batch1->shape()[1];
const size_t n = batch2->shape()[2];
const Tensor &a = is_blas_compatible(batch1) ? batch1 : rearrange(batch1);
const Tensor &b = is_blas_compatible(batch2) ? batch2 : rearrange(batch2);
const bool out_is_usable = out->is_contiguous() && out->ndim() == 3 && out->shape()[0] == batch_size && out->shape()[1] == m && out->shape()[2] == n;
if (out_is_usable) {
if (beta != 0.0f && input->data() != out->data()) {
prepare_gemm_input(out, input, batch_size, m, n);
}
gemm_(out, a, b, alpha, beta);
} else {
Tensor result = Tensor::empty({batch_size, m, n}, a->dtype(), a->device());
if (beta != 0.0f) {
prepare_gemm_input(result, input, batch_size, m, n);
}
gemm_(result, a, b, alpha, beta);
rearrange_(out, result);
}
}
} // namespace infinicore::op
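// Illustrative reference for the routine above (a plain-C++ sketch on dense
// row-major float buffers, not the library implementation):
//   out[b, i, j] = beta * input[b, i, j]
//                + alpha * sum_k batch1[b, i, k] * batch2[b, k, j]
#include <cstddef>
inline void baddbmm_reference(float *out, const float *input,
                              const float *b1, const float *b2,
                              std::size_t B, std::size_t M, std::size_t K,
                              std::size_t N, float beta, float alpha) {
    for (std::size_t b = 0; b < B; ++b) {
        for (std::size_t i = 0; i < M; ++i) {
            for (std::size_t j = 0; j < N; ++j) {
                float acc = 0.0f;
                for (std::size_t k = 0; k < K; ++k) {
                    acc += b1[(b * M + i) * K + k] * b2[(b * K + k) * N + j];
                }
                const std::size_t o = (b * M + i) * N + j;
                // beta == 0 ignores input entirely, matching the fast path above.
                out[o] = beta * input[o] + alpha * acc;
            }
        }
    }
}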
#include "infinicore/ops/bilinear.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/matmul.hpp"
#include "infinicore/ops/rearrange.hpp"
namespace infinicore::op {
namespace {
inline bool is_gemm_compatible_3d(const Tensor &t) {
if (t->ndim() != 3) {
return false;
}
const auto batch = t->shape()[0];
const auto rows = t->shape()[1];
const auto cols = t->shape()[2];
const auto bs = t->stride(0);
const auto rs = t->stride(1);
const auto cs = t->stride(2);
if (rs != 1 && cs != 1) {
return false;
}
if (cs == 1) {
if (rs < static_cast<int64_t>(cols)) {
return false;
}
} else {
if (cs < static_cast<int64_t>(rows)) {
return false;
}
}
if (batch > 1 && bs == 0) {
return false;
}
return true;
}
inline Tensor ensure_gemm_compatible(const Tensor &t) {
if (t->ndim() == 2) {
return t->is_contiguous() ? t : rearrange(t);
} else if (t->ndim() == 3) {
return is_gemm_compatible_3d(t) ? t : rearrange(t);
}
return t->is_contiguous() ? t : rearrange(t);
}
} // anonymous namespace
Tensor bilinear(Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
const size_t batch_size = x1->shape()[0];
const size_t in1_features = x1->shape()[1];
const size_t in2_features = x2->shape()[1];
const size_t out_features = weight->shape()[0];
Tensor x1_compat = ensure_gemm_compatible(x1);
Tensor x2_compat = ensure_gemm_compatible(x2);
Tensor weight_cont = weight->is_contiguous() ? weight : weight->contiguous();
Tensor weight_permuted = weight_cont->permute({1, 0, 2});
Tensor weight_permuted_cont = weight_permuted->is_contiguous()
? weight_permuted
: weight_permuted->contiguous();
Tensor weight_matrix = weight_permuted_cont->view({in1_features, out_features * in2_features});
Tensor intermediate = matmul(x1_compat, weight_matrix, 1.0f);
Tensor intermediate_3d = intermediate->view({batch_size, out_features, in2_features});
Tensor intermediate_transposed = intermediate_3d->permute({0, 2, 1});
Tensor intermediate_compat = ensure_gemm_compatible(intermediate_transposed);
Tensor x2_row = x2_compat->view({batch_size, 1, in2_features});
Tensor x2_row_compat = ensure_gemm_compatible(x2_row);
Tensor out_3d = matmul(x2_row_compat, intermediate_compat, 1.0f);
Tensor out = out_3d->view({batch_size, out_features});
if (bias) {
Tensor bias_broadcast = (*bias)->as_strided(
{batch_size, out_features},
{0, (*bias)->strides()[0]});
out = add(out, bias_broadcast);
}
return out;
}
void bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, std::optional<Tensor> bias) {
Tensor result = bilinear(x1, x2, weight, bias);
rearrange_(out, result);
}
} // namespace infinicore::op
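// Illustrative reference for bilinear above (plain-C++ sketch on dense
// row-major buffers; bilinear_reference is a hypothetical helper):
//   out[b, o] = x1[b, :] . weight[o, :, :] . x2[b, :] + bias[o]
// The implementation reshapes this into two batched matmuls instead.
#include <cstddef>
inline void bilinear_reference(float *out, const float *x1, const float *x2,
                               const float *w, const float *bias, // bias may be nullptr
                               std::size_t B, std::size_t O,
                               std::size_t I1, std::size_t I2) {
    for (std::size_t b = 0; b < B; ++b) {
        for (std::size_t o = 0; o < O; ++o) {
            float acc = bias ? bias[o] : 0.0f;
            for (std::size_t i = 0; i < I1; ++i) {
                for (std::size_t j = 0; j < I2; ++j) {
                    acc += x1[b * I1 + i] * w[(o * I1 + i) * I2 + j] * x2[b * I2 + j];
                }
            }
            out[b * O + o] = acc;
        }
    }
}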
#include "infinicore/ops/fmod.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<Fmod::schema> &Fmod::dispatcher() {
static common::OpDispatcher<Fmod::schema> dispatcher_;
return dispatcher_;
}
void Fmod::execute(Tensor c, Tensor a, Tensor b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device());
dispatcher().lookup(c->device().getType())(c, a, b);
}
Tensor fmod(Tensor a, Tensor b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
fmod_(c, a, b);
return c;
}
void fmod_(Tensor c, Tensor a, Tensor b) {
Fmod::execute(c, a, b);
}
} // namespace infinicore::op
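// Semantics note (illustrative, not part of this commit): per element the
// operation presumably follows C's std::fmod, i.e. the result keeps the sign
// of the dividend. A scalar sketch of what each element computes:
#include <cmath>
inline float fmod_scalar(float a, float b) {
    return std::fmod(a, b); // e.g. fmod_scalar(-5.5f, 3.0f) == -2.5f
}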
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/fmod.hpp"
#include <infiniop.h>
namespace infinicore::op::fmod_impl::infiniop {
thread_local common::OpCache<size_t, infiniopFmodDescriptor_t> caches(
100, // capacity
[](infiniopFmodDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyFmodDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor c, Tensor a, Tensor b) {
size_t seed = hash_combine(c, a, b); // key in descriptor-argument order
auto device_type = context::getDevice().getType();
auto device_index = context::getDevice().getIndex();
auto &cache = caches.getCache(device_type, device_index);
auto desc_opt = cache.get(seed);
infiniopFmodDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateFmodDescriptor(
context::getInfiniopHandle(c->device()), &desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetFmodWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopFmod(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), context::getStream()));
}
static bool registered = []() {
Fmod::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::fmod_impl::infiniop
@@ -2,13 +2,17 @@
#include <pybind11/pybind11.h>
#include "ops/adaptive_max_pool1d.hpp"
#include "ops/add.hpp"
#include "ops/add_rms_norm.hpp"
#include "ops/addcmul.hpp"
#include "ops/all.hpp"
#include "ops/asinh.hpp"
#include "ops/atanh.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/baddbmm.hpp"
#include "ops/bilinear.hpp"
#include "ops/binary_cross_entropy_with_logits.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cdist.hpp"
@@ -16,6 +20,7 @@
#include "ops/embedding.hpp"
#include "ops/equal.hpp"
#include "ops/flash_attention.hpp"
#include "ops/fmod.hpp"
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/kv_caching.hpp"
@@ -46,12 +51,18 @@ namespace py = pybind11;
namespace infinicore::ops {
inline void bind(py::module &m) {
bind_adaptive_max_pool1d(m);
bind_add(m);
bind_add_rms_norm(m);
bind_attention(m);
bind_asinh(m);
bind_baddbmm(m);
bind_bilinear(m);
bind_causal_softmax(m);
bind_flash_attention(m);
bind_kv_caching(m);
bind_fmod(m);
bind_random_sample(m);
bind_linear(m);
bind_matmul(m);
bind_mul(m);
......
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/adaptive_max_pool1d.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_adaptive_max_pool1d(py::module &m) {
m.def("adaptive_max_pool1d",
&op::adaptive_max_pool1d,
py::arg("x"),
py::arg("output_size"),
R"doc(1D Adaptive Max Pooling.
Args:
x: Input tensor of shape (N, C, L_in) or (N, L_in)
output_size: Target output size L_out
Returns:
Output tensor of shape (N, C, L_out) or (N, L_out)
)doc");
m.def("adaptive_max_pool1d_",
&op::adaptive_max_pool1d_,
py::arg("y"),
py::arg("x"),
py::arg("output_size"),
R"doc(In-place 1D Adaptive Max Pooling.
Args:
y: Output tensor of shape (N, C, L_out) or (N, L_out)
x: Input tensor of shape (N, C, L_in) or (N, L_in)
output_size: Target output size L_out
)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/asinh.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_asinh(py::module &m) {
m.def("asinh",
&op::asinh,
py::arg("x"),
R"doc(Element-wise inverse hyperbolic sine function.)doc");
m.def("asinh_",
&op::asinh_,
py::arg("y"),
py::arg("x"),
R"doc(In-place element-wise inverse hyperbolic sine function.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/baddbmm.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline Tensor py_baddbmm(Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
return op::baddbmm(input, batch1, batch2, beta, alpha);
}
inline void py_baddbmm_(Tensor out, Tensor input, Tensor batch1, Tensor batch2, float beta = 1.0f, float alpha = 1.0f) {
op::baddbmm_(out, input, batch1, batch2, beta, alpha);
}
inline void bind_baddbmm(py::module &m) {
m.def("baddbmm",
&py_baddbmm,
py::arg("input"),
py::arg("batch1"),
py::arg("batch2"),
py::arg("beta") = 1.0f,
py::arg("alpha") = 1.0f,
R"doc(Batched matrix-matrix product with addition.
Args:
input: Input tensor
batch1: First batch of matrices
batch2: Second batch of matrices
beta: Scaling factor for input tensor
alpha: Scaling factor for the product of batch1 and batch2
Returns:
Output tensor after baddbmm operation
)doc");
m.def("baddbmm_",
&py_baddbmm_,
py::arg("out"),
py::arg("input"),
py::arg("batch1"),
py::arg("batch2"),
py::arg("beta") = 1.0f,
py::arg("alpha") = 1.0f,
R"doc(In-place batched matrix-matrix product with addition.
Args:
out: Output tensor
input: Input tensor
batch1: First batch of matrices
batch2: Second batch of matrices
beta: Scaling factor for input tensor
alpha: Scaling factor for the product of batch1 and batch2
)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/bilinear.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline Tensor py_bilinear(Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
std::optional<Tensor> bias_tensor = std::nullopt;
if (!bias.is_none()) {
bias_tensor = bias.cast<Tensor>();
}
return op::bilinear(x1, x2, weight, bias_tensor);
}
inline void py_bilinear_(Tensor out, Tensor x1, Tensor x2, Tensor weight, pybind11::object bias) {
std::optional<Tensor> bias_tensor = std::nullopt;
if (!bias.is_none()) {
bias_tensor = bias.cast<Tensor>();
}
op::bilinear_(out, x1, x2, weight, bias_tensor);
}
inline void bind_bilinear(py::module &m) {
m.def("bilinear",
&py_bilinear,
py::arg("x1"),
py::arg("x2"),
py::arg("weight"),
py::arg("bias"),
R"doc(Bilinear transformation of two input tensors.
Args:
x1: First input tensor
x2: Second input tensor
weight: Weight tensor
bias: Bias tensor (optional)
Returns:
Output tensor after bilinear transformation
)doc");
m.def("bilinear_",
&py_bilinear_,
py::arg("out"),
py::arg("x1"),
py::arg("x2"),
py::arg("weight"),
py::arg("bias"),
R"doc(In-place bilinear transformation of two input tensors.
Args:
out: Output tensor
x1: First input tensor
x2: Second input tensor
weight: Weight tensor
bias: Bias tensor (optional)
)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/fmod.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_fmod(py::module &m) {
m.def("fmod",
&op::fmod,
py::arg("a"),
py::arg("b"),
R"doc(Element-wise floating point remainder of division of two tensors.)doc");
m.def("fmod_",
&op::fmod_,
py::arg("c"),
py::arg("a"),
py::arg("b"),
R"doc(In-place element-wise floating point remainder of division of two tensors.)doc");
}
} // namespace infinicore::ops
#ifndef ADAPTIVE_MAX_POOL1D_H
#define ADAPTIVE_MAX_POOL1D_H
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::adaptive_max_pool1d::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AdaptiveMaxPool1dInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
AdaptiveMaxPool1dInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
size_t output_size); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
#endif // ADAPTIVE_MAX_POOL1D_H
#include "adaptive_max_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::adaptive_max_pool1d::cpu {
Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename T>
infiniStatus_t adaptiveMaxPool1d(const AdaptiveMaxPool1dInfo *info, T *y, const T *x) {
const size_t ndim = info->ndim();
const size_t batch_size = info->shape[0];
const size_t channels = ndim > 2 ? info->shape[1] : 1;
const size_t input_length = info->input_length();
const size_t output_length = info->output_length();
// Total number of work items (batch * channels)
const ptrdiff_t total_blocks = static_cast<ptrdiff_t>(batch_size * channels);
const ptrdiff_t x_stride_last = info->x_strides.back();
#pragma omp parallel for
for (ptrdiff_t block_idx = 0; block_idx < total_blocks; ++block_idx) {
const size_t i = block_idx / channels; // batch index
const size_t j = block_idx % channels; // channel index
const T *x_ptr_base;
T *y_ptr_base;
if (ndim > 2) { // (N, C, L)
x_ptr_base = x + i * info->x_strides[0] + j * info->x_strides[1];
y_ptr_base = y + i * info->y_strides[0] + j * info->y_strides[1];
} else { // (N, L)
x_ptr_base = x + i * info->x_strides[0];
y_ptr_base = y + i * info->y_strides[0];
}
for (size_t out_idx = 0; out_idx < output_length; ++out_idx) {
size_t start_index = (out_idx * input_length) / output_length;
size_t end_index = ((out_idx + 1) * input_length + output_length - 1) / output_length;
end_index = std::min(end_index, input_length);
const size_t window_len = end_index - start_index;
if (window_len == 0) { // skip empty windows (window_len is unsigned)
continue;
}
const T *window_ptr = x_ptr_base + start_index * x_stride_last;
auto max_val = op::common_cpu::reduce_op::max(window_ptr, window_len, x_stride_last);
y_ptr_base[out_idx] = utils::cast<T>(max_val); // assumes unit stride along y's length dim
}
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (_info.atype == INFINI_DTYPE_F32) {
return adaptiveMaxPool1d(&_info, (float *)y, (const float *)x);
} else if (_info.atype == INFINI_DTYPE_F16) {
return adaptiveMaxPool1d(&_info, (fp16_t *)y, (const fp16_t *)x);
} else if (_info.atype == INFINI_DTYPE_BF16) {
return adaptiveMaxPool1d(&_info, (bf16_t *)y, (const bf16_t *)x);
} else if (_info.atype == INFINI_DTYPE_F64) {
return adaptiveMaxPool1d(&_info, (double *)y, (const double *)x);
}
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::adaptive_max_pool1d::cpu
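// Worked example of the window arithmetic above (illustrative): with
// input_length = 10 and output_length = 4, the four windows are
// [0,3), [2,5), [5,8), [7,10) -- adjacent windows may overlap, and together
// they always cover the whole input. Extracted as a standalone helper:
#include <cstddef>
inline void adaptive_window(std::size_t out_idx, std::size_t in_len,
                            std::size_t out_len,
                            std::size_t &start, std::size_t &end) {
    start = (out_idx * in_len) / out_len;                   // floor(out_idx * in_len / out_len)
    end = ((out_idx + 1) * in_len + out_len - 1) / out_len; // ceil((out_idx + 1) * in_len / out_len)
}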
#ifndef __ADAPTIVE_MAX_POOL1D_CPU_H__
#define __ADAPTIVE_MAX_POOL1D_CPU_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(cpu)
#endif
#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#define __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#include <cmath>
#include <limits>
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ void adaptiveMaxPool1dBlock(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
size_t block_idx = blockIdx.x;
size_t batch_idx = block_idx / channels;
size_t channel_idx = block_idx % channels;
const Tdata *x_ptr;
Tdata *y_ptr;
if (ndim > 2) {
x_ptr = x + batch_idx * stride_x_batch + channel_idx * stride_x_channel;
y_ptr = y + batch_idx * stride_y_batch + channel_idx * stride_y_channel;
} else {
x_ptr = x + batch_idx * stride_x_batch;
y_ptr = y + batch_idx * stride_y_batch;
}
for (size_t out_idx = threadIdx.x; out_idx < output_length; out_idx += BLOCK_SIZE) {
// Exact integer floor/ceil, matching the CPU backend even for lengths
// where single-precision floats would lose precision.
int start_index = static_cast<int>((out_idx * input_length) / output_length);
int end_index = static_cast<int>(((out_idx + 1) * input_length + output_length - 1) / output_length);
if (end_index <= start_index) {
continue;
}
Tcompute max_val = Tcompute(x_ptr[start_index * stride_x_length]);
for (int i = start_index + 1; i < end_index; ++i) {
Tcompute val = Tcompute(x_ptr[i * stride_x_length]);
max_val = max(max_val, val);
}
y_ptr[out_idx] = Tdata(max_val);
}
}
#endif
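// Launch sketch (illustrative; the real wrappers live in the Metax/Moore
// backends below): one block per (batch, channel) pair, BLOCK_SIZE threads
// striding across output positions. For a contiguous (N, C, L) layout:
//
//   adaptiveMaxPool1dKernel<256, half, float><<<N * C, 256, 0, stream>>>(
//       y, /*stride_y_batch=*/C * L_out, /*stride_y_channel=*/L_out,
//       x, /*stride_x_batch=*/C * L_in, /*stride_x_channel=*/L_in,
//       /*stride_x_length=*/1, C, L_in, L_out, /*ndim=*/3);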
#ifndef __ADAPTIVE_MAX_POOL1D_H__
#define __ADAPTIVE_MAX_POOL1D_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::adaptive_max_pool1d {
class AdaptiveMaxPool1dInfo {
AdaptiveMaxPool1dInfo() = default;
public:
infiniDtype_t atype;
std::vector<size_t> shape;
std::vector<ptrdiff_t> y_strides;
std::vector<ptrdiff_t> x_strides;
size_t input_size;
size_t output_size;
size_t ndim() const { return shape.size(); }
size_t input_length() const { return input_size; }
size_t output_length() const { return output_size; }
static utils::Result<AdaptiveMaxPool1dInfo> create(
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto atype = y_desc->dtype();
if (x_desc->dtype() != atype) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
if (atype != INFINI_DTYPE_F16 && atype != INFINI_DTYPE_BF16 && atype != INFINI_DTYPE_F32 && atype != INFINI_DTYPE_F64) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
const size_t y_ndim = y_desc->ndim();
const size_t x_ndim = x_desc->ndim();
if (y_ndim != x_ndim) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
for (size_t i = 0; i < y_ndim - 1; ++i) {
if (x_desc->dim(i) != y_desc->dim(i)) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
}
if (y_desc->dim(y_ndim - 1) != output_size) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
return utils::Result<AdaptiveMaxPool1dInfo>(AdaptiveMaxPool1dInfo{
atype,
y_desc->shape(),
y_desc->strides(),
x_desc->strides(),
x_desc->dim(x_ndim - 1),
output_size});
}
};
} // namespace op::adaptive_max_pool1d
#endif // __ADAPTIVE_MAX_POOL1D_H__
#ifndef __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#define __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(metax)
#endif
#include "../../../devices/metax/metax_common.h"
#include "adaptive_max_pool1d_metax.cuh"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t numblock,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
hcStream_t stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<numblock, BLOCK_SIZE, 0, stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__hpcc_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream_) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto stream = reinterpret_cast<hcStream_t>(stream_);
if (_opaque->internal->maxThreadsPerBlock() >= METAX_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::metax
#ifndef __ADAPTIVE_MAX_POOL1D_MOORE_H__
#define __ADAPTIVE_MAX_POOL1D_MOORE_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(moore)
#endif
#include "../../../devices/moore/moore_common.h"
#include "adaptive_max_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t num_blocks,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
musaStream_t musa_stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__mt_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
// Pick the largest block size the device supports, checking in descending
// order so the 2048 branch is reachable.
if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_2048>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::moore