Commit cb7f0b7d authored by wooway777

Revert "Merge pull request #1056 from InfiniTensor/issue/1031"

This reverts commit 7f295448, reversing
changes made to e60985dc.
parent 037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/fmod.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_fmod(py::module &m) {
m.def("fmod",
&op::fmod,
py::arg("a"),
py::arg("b"),
R"doc(Element-wise floating point remainder of division of two tensors.)doc");
m.def("fmod_",
&op::fmod_,
py::arg("c"),
py::arg("a"),
py::arg("b"),
R"doc(In-place element-wise floating point remainder of division of two tensors.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardswish.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardswish(py::module &m) {
m.def("hardswish",
&op::hardswish,
py::arg("input"),
R"doc(Out-of-place Hardswish activation.)doc");
m.def("hardswish_",
&op::hardswish_,
py::arg("output"),
py::arg("input"),
R"doc(In-place Hardswish activation.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardtanh.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardtanh(py::module &m) {
m.def("hardtanh",
&op::hardtanh,
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(Apply the HardTanh activation.)doc");
m.def("hardtanh_",
&op::hardtanh_,
py::arg("output"),
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(In-place HardTanh activation.)doc");
}
} // namespace infinicore::ops
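// --- Illustrative sketch (not part of this commit): the bind_* helpers above are
// meant to be invoked from a single pybind11 module definition, roughly as below.
// The module name "_infinicore_sketch" is an assumption for illustration; the real
// registration point is not shown in this diff, and the binding headers shown above
// are assumed to be included here.
#include <pybind11/pybind11.h>
PYBIND11_MODULE(_infinicore_sketch, m) {
// Each helper registers an operator and its in-place variant on the module.
infinicore::ops::bind_fmod(m);
infinicore::ops::bind_hardswish(m);
infinicore::ops::bind_hardtanh(m);
}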
#ifndef ADAPTIVE_MAX_POOL1D_H
#define ADAPTIVE_MAX_POOL1D_H
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::adaptive_max_pool1d::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AdaptiveMaxPool1dInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
AdaptiveMaxPool1dInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
size_t output_size); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
#endif // ADAPTIVE_MAX_POOL1D_H
#include "adaptive_max_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::adaptive_max_pool1d::cpu {
Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename T>
infiniStatus_t adaptiveMaxPool1d(const AdaptiveMaxPool1dInfo *info, T *y, const T *x) {
const size_t ndim = info->ndim();
const size_t batch_size = info->shape[0];
const size_t channels = ndim > 2 ? info->shape[1] : 1;
const size_t input_length = info->input_length();
const size_t output_length = info->output_length();
// Total number of task blocks to process (batch_size * channels)
const ptrdiff_t total_blocks = static_cast<ptrdiff_t>(batch_size * channels);
const ptrdiff_t x_stride_last = info->x_strides.back();
#pragma omp parallel for
for (ptrdiff_t block_idx = 0; block_idx < total_blocks; ++block_idx) {
const size_t i = block_idx / channels; // batch index
const size_t j = block_idx % channels; // channel index
const T *x_ptr_base;
T *y_ptr_base;
if (ndim > 2) { // (N, C, L)
x_ptr_base = x + i * info->x_strides[0] + j * info->x_strides[1];
y_ptr_base = y + i * info->y_strides[0] + j * info->y_strides[1];
} else { // (N, L)
x_ptr_base = x + i * info->x_strides[0];
y_ptr_base = y + i * info->y_strides[0];
}
for (size_t out_idx = 0; out_idx < output_length; ++out_idx) {
size_t start_index = (out_idx * input_length) / output_length;
size_t end_index = ((out_idx + 1) * input_length + output_length - 1) / output_length;
start_index = std::max(start_index, size_t(0));
end_index = std::min(end_index, input_length);
size_t window_len = end_index - start_index;
if (window_len == 0) {
continue;
}
const T *window_ptr = x_ptr_base + start_index * x_stride_last;
auto max_val = op::common_cpu::reduce_op::max(window_ptr, window_len, x_stride_last);
y_ptr_base[out_idx] = utils::cast<T>(max_val);
}
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (_info.atype == INFINI_DTYPE_F32) {
return adaptiveMaxPool1d(&_info, (float *)y, (const float *)x);
} else if (_info.atype == INFINI_DTYPE_F16) {
return adaptiveMaxPool1d(&_info, (fp16_t *)y, (const fp16_t *)x);
} else if (_info.atype == INFINI_DTYPE_BF16) {
return adaptiveMaxPool1d(&_info, (bf16_t *)y, (const bf16_t *)x);
} else if (_info.atype == INFINI_DTYPE_F64) {
return adaptiveMaxPool1d(&_info, (double *)y, (const double *)x);
}
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::adaptive_max_pool1d::cpu
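// --- Illustrative sketch (not part of this commit): the adaptive window arithmetic
// used in adaptiveMaxPool1d above, isolated as a standalone program. Each output
// index i covers [floor(i * L_in / L_out), ceil((i + 1) * L_in / L_out)); for
// L_in = 10 and L_out = 4 this prints the windows [0,3) [2,5) [5,8) [7,10), so the
// whole input is covered and adjacent windows may overlap.
#include <cstddef>
#include <cstdio>
int main() {
const size_t input_length = 10;
const size_t output_length = 4;
for (size_t out_idx = 0; out_idx < output_length; ++out_idx) {
// Same integer arithmetic as the kernel: floor for the start, ceiling for the end.
size_t start_index = (out_idx * input_length) / output_length;
size_t end_index = ((out_idx + 1) * input_length + output_length - 1) / output_length;
std::printf("window %zu: [%zu, %zu)\n", out_idx, start_index, end_index);
}
return 0;
}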
#ifndef __ADAPTIVE_MAX_POOL1D_CPU_H__
#define __ADAPTIVE_MAX_POOL1D_CPU_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(cpu)
#endif
#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#define __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#include <cmath>
#include <limits>
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ void adaptiveMaxPool1dBlock(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
size_t block_idx = blockIdx.x;
size_t batch_idx = block_idx / channels;
size_t channel_idx = block_idx % channels;
const Tdata *x_ptr;
Tdata *y_ptr;
if (ndim > 2) {
x_ptr = x + batch_idx * stride_x_batch + channel_idx * stride_x_channel;
y_ptr = y + batch_idx * stride_y_batch + channel_idx * stride_y_channel;
} else {
x_ptr = x + batch_idx * stride_x_batch;
y_ptr = y + batch_idx * stride_y_batch;
}
for (size_t out_idx = threadIdx.x; out_idx < output_length; out_idx += BLOCK_SIZE) {
int start_index = static_cast<int>(floorf((float)out_idx * input_length / output_length));
int end_index = static_cast<int>(ceilf((float)(out_idx + 1) * input_length / output_length));
if (end_index <= start_index) {
continue;
}
Tcompute max_val = Tcompute(x_ptr[start_index * stride_x_length]);
for (int i = start_index + 1; i < end_index; ++i) {
Tcompute val = Tcompute(x_ptr[i * stride_x_length]);
max_val = max(max_val, val);
}
y_ptr[out_idx] = Tdata(max_val);
}
}
#endif
#ifndef __ADAPTIVE_MAX_POOL1D_H__
#define __ADAPTIVE_MAX_POOL1D_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::adaptive_max_pool1d {
class AdaptiveMaxPool1dInfo {
AdaptiveMaxPool1dInfo() = default;
public:
infiniDtype_t atype;
std::vector<size_t> shape;
std::vector<ptrdiff_t> y_strides;
std::vector<ptrdiff_t> x_strides;
size_t input_size;
size_t output_size;
size_t ndim() const { return shape.size(); }
size_t input_length() const { return input_size; }
size_t output_length() const { return output_size; }
static utils::Result<AdaptiveMaxPool1dInfo> create(
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto atype = y_desc->dtype();
if (x_desc->dtype() != atype) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
if (atype != INFINI_DTYPE_F16 && atype != INFINI_DTYPE_BF16 && atype != INFINI_DTYPE_F32 && atype != INFINI_DTYPE_F64) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
const size_t y_ndim = y_desc->ndim();
const size_t x_ndim = x_desc->ndim();
if (y_ndim != x_ndim) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
for (size_t i = 0; i < y_ndim - 1; ++i) {
if (x_desc->dim(i) != y_desc->dim(i)) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
}
if (y_desc->dim(y_ndim - 1) != output_size) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
return utils::Result<AdaptiveMaxPool1dInfo>(AdaptiveMaxPool1dInfo{
atype,
y_desc->shape(),
y_desc->strides(),
x_desc->strides(),
x_desc->dim(x_ndim - 1),
output_size});
}
};
} // namespace op::adaptive_max_pool1d
#endif // __ADAPTIVE_MAX_POOL1D_H__
#ifndef __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#define __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(metax)
#endif
#include "../../../devices/metax/metax_common.h"
#include "adaptive_max_pool1d_metax.cuh"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t num_blocks,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
hcStream_t stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__hpcc_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream_) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto stream = reinterpret_cast<hcStream_t>(stream_);
if (_opaque->internal->maxThreadsPerBlock() >= METAX_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::metax
#ifndef __ADAPTIVE_MAX_POOL1D_MOORE_H__
#define __ADAPTIVE_MAX_POOL1D_MOORE_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(moore)
#endif
#include "../../../devices/moore/moore_common.h"
#include "adaptive_max_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t num_blocks,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
musaStream_t musa_stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__mt_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_2048>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "adaptive_max_pool1d_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_CUDA_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t num_blocks,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
cudaStream_t cuda_stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__nv_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
cuda_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::nvidia
#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_H__
#define __ADAPTIVE_MAX_POOL1D_CUDA_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(nvidia)
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/adaptive_max_pool1d.h"
#ifdef ENABLE_CPU_API
#include "cpu/adaptive_max_pool1d_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/adaptive_max_pool1d_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/adaptive_max_pool1d_metax.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/adaptive_max_pool1d_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor(
infiniopHandle_t handle,
infiniopAdaptiveMaxPool1dDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::adaptive_max_pool1d::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
x_desc, \
output_size)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef CREATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize(
infiniopAdaptiveMaxPool1dDescriptor_t desc,
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopAdaptiveMaxPool1d(
infiniopAdaptiveMaxPool1dDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, y, x, stream);
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef CALCULATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor(
infiniopAdaptiveMaxPool1dDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef DESTROY
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
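// --- Illustrative sketch (not part of this commit): a typical call sequence for the
// adaptive_max_pool1d C API defined above. The handle, tensor descriptors, device
// buffers, and workspace are assumed to have been created elsewhere; only the four
// entry points from this file are used.
static infiniStatus_t runAdaptiveMaxPool1dOnce(
infiniopHandle_t handle,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size,
void *y, const void *x,
void *workspace, size_t workspace_capacity,
void *stream) {
infiniopAdaptiveMaxPool1dDescriptor_t desc = nullptr;
infiniStatus_t status = infiniopCreateAdaptiveMaxPool1dDescriptor(
handle, &desc, y_desc, x_desc, output_size);
if (status != INFINI_STATUS_SUCCESS) {
return status;
}
// Query how much workspace the chosen backend needs before launching.
size_t workspace_size = 0;
status = infiniopGetAdaptiveMaxPool1dWorkspaceSize(desc, &workspace_size);
if (status == INFINI_STATUS_SUCCESS && workspace_size <= workspace_capacity) {
status = infiniopAdaptiveMaxPool1d(desc, workspace, workspace_size, y, x, stream);
}
// The descriptor is independent of the buffers and must be destroyed explicitly.
infiniopDestroyAdaptiveMaxPool1dDescriptor(desc);
return status;
}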
#include "asinh_cpu.h"
namespace op::asinh::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<AsinhOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<AsinhOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<AsinhOp, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<AsinhOp, bf16_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::asinh::cpu
#ifndef __ASINH_CPU_H__
#define __ASINH_CPU_H__
#include <cmath>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(asinh, cpu)
namespace op::asinh::cpu {
typedef struct AsinhOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
T operator()(const T &x) const {
return std::asinh(x);
}
} AsinhOp;
} // namespace op::asinh::cpu
#endif // __ASINH_CPU_H__
#ifndef __ASINH_CUDA_KERNEL_H__
#define __ASINH_CUDA_KERNEL_H__
namespace op::asinh::cuda {
typedef struct AsinhOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
__device__ __forceinline__ T operator()(const T &x) const {
if constexpr (std::is_same_v<T, half>) {
float x_f = __half2float(x);
return __float2half(asinhf(x_f));
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
float x_f = __bfloat162float(x);
return __float2bfloat16(asinhf(x_f));
} else if constexpr (std::is_same_v<T, float>) {
return asinhf(x);
} else {
return ::asinh(x);
}
}
} AsinhOp;
} // namespace op::asinh::cuda
#endif // __ASINH_CUDA_KERNEL_H__
#include "asinh_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::asinh::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create Metax elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::asinh::metax
#ifndef __ASINH_METAX_API_H__
#define __ASINH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(asinh, metax)
#endif // __ASINH_METAX_API_H__