Commit cb7f0b7d authored by wooway777's avatar wooway777
Browse files

Revert "Merge pull request #1056 from InfiniTensor/issue/1031"

This reverts commit 7f295448, reversing
changes made to e60985dc.
parent 037140c0
#ifndef __ASINH_MOORE_API_H__
#define __ASINH_MOORE_API_H__

#include "../../../elementwise/moore/elementwise_moore_api.h"

// Declares op::asinh::moore::Descriptor via the shared elementwise
// descriptor macro.
ELEMENTWISE_DESCRIPTOR(asinh, moore)

#endif // __ASINH_MOORE_API_H__
#include "asinh_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "../cuda/kernel.cuh"
namespace op::asinh::moore {

Descriptor::~Descriptor() = default;

// Builds an asinh descriptor for the Moore backend: validates that the
// output dtype is one of F16/F32/F64/BF16 and that the (single) input
// shape matches the output shape, then delegates to the shared MOORE
// elementwise descriptor factory.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Launches the elementwise asinh kernel for the dtype recorded at create
// time. Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE when the caller's
// workspace is smaller than workspaceSize().
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch path returns, so no trailing return is needed; the
    // previously unreachable `return INFINI_STATUS_SUCCESS;` was removed
    // to match the nvidia implementation.
}
} // namespace op::asinh::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "asinh_nvidia.cuh"
namespace op::asinh::nvidia {

Descriptor::~Descriptor() = default;

// Creates an asinh descriptor for the NVIDIA backend. The output dtype
// must be one of F16/BF16/F32/F64 and the single input's shape must equal
// the output shape; on success the shared CUDA elementwise descriptor is
// constructed.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);

    const auto &in_desc = input_desc_vec.at(0);
    const auto &out_shape = out_desc->shape();
    const auto &in_shape = in_desc->shape();
    auto dtype = out_desc->dtype();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(out_shape, in_shape);

    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Dispatches the elementwise asinh kernel according to the dtype captured
// at descriptor-creation time; rejects undersized workspaces first.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::asinh::nvidia
#ifndef __ASINH_NVIDIA_API_H__
#define __ASINH_NVIDIA_API_H__

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

// Declares op::asinh::nvidia::Descriptor via the shared elementwise
// descriptor macro.
ELEMENTWISE_DESCRIPTOR(asinh, nvidia)

#endif // __ASINH_NVIDIA_API_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/asinh.h"
#ifdef ENABLE_CPU_API
#include "cpu/asinh_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/asinh_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/asinh_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/asinh_moore.h"
#endif
// C API entry point: creates an asinh descriptor for the device identified
// by `handle->device`, forwarding to the matching backend implementation.
// `y_desc` is the output tensor, `x_desc` the single input.
__INFINI_C infiniStatus_t infiniopCreateAsinhDescriptor(
    infiniopHandle_t handle,
    infiniopAsinhDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

// One switch case per backend: cast desc_ptr to the backend descriptor
// type and delegate to its Descriptor::create with {x_desc} as inputs.
#define CREATE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        return op::asinh::NAMESPACE::Descriptor::create(                     \
            handle,                                                          \
            reinterpret_cast<op::asinh::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                          \
            {x_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Reports the workspace size (in bytes) required by infiniopAsinh for the
// given descriptor, dispatching on the device type the descriptor was
// created for.
__INFINI_C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) {

// One switch case per backend: read workspaceSize() from the typed descriptor.
#define GET(CASE, NAMESPACE)                                                                       \
    case CASE:                                                                                     \
        *size = reinterpret_cast<const op::asinh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
    // Every switch path returns above; the previously duplicated fallback
    // return after the switch was unreachable and has been removed.
}
// C API entry point: computes y = asinh(x) using the backend the descriptor
// was created for. `workspace`/`workspace_size` must satisfy
// infiniopGetAsinhWorkspaceSize; `stream` is the backend stream handle.
__INFINI_C infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc,
                                        void *workspace,
                                        size_t workspace_size,
                                        void *y,
                                        const void *x,
                                        void *stream) {

// One switch case per backend: forward to the typed descriptor's calculate().
#define CALCULATE(CASE, NAMESPACE)                                              \
    case CASE:                                                                  \
        return reinterpret_cast<const op::asinh::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x}, stream);

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// C API entry point: destroys a descriptor previously created with
// infiniopCreateAsinhDescriptor, deleting it through its concrete backend type.
__INFINI_C infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) {

// One switch case per backend: delete via the typed descriptor pointer.
#define DESTROY(CASE, NAMESPACE)                                          \
    case CASE:                                                            \
        delete reinterpret_cast<op::asinh::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DESTROY
}
#ifndef __AVG_POOL1D_H__
#define __AVG_POOL1D_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "infiniop/ops/avg_pool1d.h"
// Declares the per-backend avg_pool1d Descriptor class; instantiated once
// per backend namespace (cpu, nvidia, metax, moore, ...) via DESCRIPTOR(ns).
// The descriptor holds the validated AvgPool1dInfo, a backend-specific
// Opaque state pointer, and the required workspace size. (Comments cannot
// appear inside the macro body because of the line continuations.)
#define DESCRIPTOR(NAMESPACE)                                     \
    namespace op::avg_pool1d::NAMESPACE {                         \
    class Descriptor final : public InfiniopDescriptor {          \
        struct Opaque;                                            \
        Opaque *_opaque;                                          \
        AvgPool1dInfo _info;                                      \
        size_t _workspace_size;                                   \
                                                                  \
        Descriptor(                                               \
            AvgPool1dInfo info,                                   \
            size_t workspace_size_,                               \
            Opaque *opaque,                                       \
            infiniDevice_t device_type,                           \
            int device_id)                                        \
            : InfiniopDescriptor{device_type, device_id},         \
              _opaque(opaque),                                    \
              _info(info),                                        \
              _workspace_size(workspace_size_) {}                 \
                                                                  \
    public:                                                       \
        ~Descriptor();                                            \
                                                                  \
        size_t workspaceSize() const { return _workspace_size; }  \
                                                                  \
        static infiniStatus_t create(                             \
            infiniopHandle_t handle,                              \
            Descriptor **desc_ptr,                                \
            infiniopTensorDescriptor_t y_desc,                    \
            infiniopTensorDescriptor_t x_desc,                    \
            size_t kernel_size,                                   \
            size_t stride,                                        \
            size_t padding);                                      \
                                                                  \
        infiniStatus_t calculate(                                 \
            void *workspace,                                      \
            size_t workspace_size,                                \
            void *y,                                              \
            const void *x,                                        \
            void *stream) const;                                  \
    };                                                            \
    }
// Validated, backend-agnostic description of a 1-D average pooling problem.
// x is (batch, channels, in_width); y is (batch, channels, out_width).
class AvgPool1dInfo {
private:
    AvgPool1dInfo() = default;

public:
    // dtype shared by x and y
    infiniDtype_t dtype;
    // logical dimensions
    size_t batch, channels, in_width, out_width;
    // pooling hyper-parameters, in elements
    size_t kernel_size, stride, padding;
    // element strides of y and x (signed: descriptors may carry negative strides)
    ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width;
    ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width;

    // Validates the descriptors and hyper-parameters and packs them into an
    // AvgPool1dInfo. Requirements: non-null 3-D descriptors, matching
    // supported dtypes, kernel_size > 0, stride > 0, and
    // y.dim(2) == (in_width + 2*padding - kernel_size) / stride + 1.
    static utils::Result<AvgPool1dInfo> createAvgPool1dInfo(
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc,
        size_t kernel_size,
        size_t stride,
        size_t padding) {
        CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t dtype = y_desc->dtype();
        CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
        CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE);

        // Reject degenerate hyper-parameters up front: stride == 0 would
        // divide by zero below and kernel_size == 0 makes the average
        // undefined.
        CHECK_OR_RETURN(kernel_size > 0 && stride > 0, INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t batch = x_desc->dim(0);
        size_t channels = x_desc->dim(1);
        size_t in_width = x_desc->dim(2);
        CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_OR_RETURN(y_desc->dim(1) == channels, INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t padded_len = in_width + 2 * padding;
        CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE);
        size_t expected_out_width = (padded_len - kernel_size) / stride + 1;
        CHECK_OR_RETURN(y_desc->dim(2) == expected_out_width, INFINI_STATUS_BAD_TENSOR_SHAPE);
        size_t out_width = expected_out_width;

        return utils::Result<AvgPool1dInfo>(AvgPool1dInfo{
            dtype,
            batch, channels, in_width, out_width,
            kernel_size, stride, padding,
            y_desc->stride(0), y_desc->stride(1), y_desc->stride(2),
            x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)});
    }
};
#endif
#include "avg_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include <algorithm>
namespace op::avg_pool1d::cpu {

Descriptor::~Descriptor() = default;

// Validates the tensor descriptors / pooling parameters and builds a CPU
// descriptor. The CPU path needs no opaque state and no workspace.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info);
    *desc_ptr = new Descriptor(
        info.take(),
        0,       // workspace size: none required on CPU
        nullptr, // no opaque state on CPU
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Reference CPU implementation of 1-D average pooling with
// count_include_pad semantics: the divisor is always kernel_size, even
// when the window overlaps the zero padding. Accumulation happens in
// float regardless of T; the result is cast back to T per output element.
//
// Offsets are kept signed (ptrdiff_t): the strides come from tensor
// descriptors and may legally be negative, which the previous size_t
// arithmetic would have wrapped.
template <typename T>
infiniStatus_t calculateAvgPool1d(const AvgPool1dInfo &info,
                                  T *y,
                                  const T *x) {
    const float inv_kernel = 1.0f / static_cast<float>(info.kernel_size);

#pragma omp parallel for
    for (ptrdiff_t bc = 0; bc < ptrdiff_t(info.batch * info.channels); ++bc) {
        ptrdiff_t b = bc / info.channels;
        ptrdiff_t c = bc % info.channels;
        ptrdiff_t y_base = b * info.y_stride_batch + c * info.y_stride_channel;
        ptrdiff_t x_base = b * info.x_stride_batch + c * info.x_stride_channel;

        for (size_t ow = 0; ow < info.out_width; ++ow) {
            ptrdiff_t y_offset = y_base + ptrdiff_t(ow) * info.y_stride_width;

            // Window [start_w, end_w) in input coordinates, clipped to the
            // valid range; padded positions contribute zero. The padding is
            // cast explicitly so the subtraction stays signed.
            long long start_w = static_cast<long long>(ow * info.stride) - static_cast<long long>(info.padding);
            long long end_w = start_w + static_cast<long long>(info.kernel_size);
            long long valid_start = std::max(0LL, start_w);
            long long valid_end = std::min(static_cast<long long>(info.in_width), end_w);

            float sum = 0.0f;
            for (long long iw = valid_start; iw < valid_end; ++iw) {
                ptrdiff_t x_offset = x_base + ptrdiff_t(iw) * info.x_stride_width;
                sum += utils::cast<float>(x[x_offset]);
            }
            y[y_offset] = utils::cast<T>(sum * inv_kernel);
        }
    }
    return INFINI_STATUS_SUCCESS;
}
// Instantiates the typed CPU implementation for TDATA.
#define CALCULATE(TDATA) calculateAvgPool1d(_info, (TDATA *)y, (const TDATA *)x)

// Dispatches on the dtype validated at create time. `workspace`,
// `workspace_size` and `stream` are unused on the CPU path.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(fp16_t);
    case INFINI_DTYPE_BF16:
        return CALCULATE(bf16_t);
    case INFINI_DTYPE_F32:
        return CALCULATE(float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE
} // namespace op::avg_pool1d::cpu
#ifndef __INFINIOP_AVG_POOL1D_CPU_H__
#define __INFINIOP_AVG_POOL1D_CPU_H__

#include "../avg_pool1d.h"

// Declares op::avg_pool1d::cpu::Descriptor.
DESCRIPTOR(cpu)

#endif
#ifndef __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__
#define __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__

#include <type_traits>

// Grid-stride 1-D average-pooling device kernel shared by the CUDA-like
// backends. One output element per loop iteration; count_include_pad
// semantics (divisor is always kernel_size, padded positions contribute 0).
//
// Accumulation uses float for every T except double (which accumulates in
// double), so half/bf16 inputs no longer lose precision in the inner sum;
// this matches the metax/moore kernel implementations. Dividing in the
// accumulator type also removes the earlier Iluvatar-specific workaround
// (its __half could not be constructed from size_t).
template <typename T>
__device__ void avgPool1dKernel(
    T *y,
    const T *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    using Acc = typename std::conditional<std::is_same<T, double>::value, double, float>::type;

    size_t total_elements = batch * channels * out_width;
    const Acc inv_kernel = Acc(1) / static_cast<Acc>(kernel_size);

    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        // Decompose the flat index into (b, c, ow).
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;

        // Window start in input coordinates; may be negative due to padding.
        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);

        Acc sum = Acc(0);
        for (size_t k = 0; k < kernel_size; ++k) {
            long long iw = start_w + static_cast<long long>(k);
            if (iw >= 0 && iw < static_cast<long long>(in_width)) {
                size_t x_offset = b * x_stride_batch + c * x_stride_channel + iw * x_stride_width;
                sum += static_cast<Acc>(x[x_offset]);
            }
        }
        y[y_offset] = static_cast<T>(sum * inv_kernel);
    }
}

#endif
#ifndef __INFINIOP_AVG_POOL1D_METAX_H__
#define __INFINIOP_AVG_POOL1D_METAX_H__

#include "../avg_pool1d.h"

// Declares op::avg_pool1d::metax::Descriptor.
DESCRIPTOR(metax)

#endif // __INFINIOP_AVG_POOL1D_METAX_H__
#include "../../../devices/metax/metax_common.h"
#include "avg_pool1d_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include <type_traits>
namespace op::avg_pool1d::metax {

// Per-descriptor opaque state: keeps the metax handle internals alive so
// device limits (e.g. max threads per block) can be queried at launch time.
struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates (y, x, kernel_size, stride, padding) via
// AvgPool1dInfo::createAvgPool1dInfo and, on success, allocates a
// Descriptor that shares the metax handle internals. No device workspace
// is required (workspace size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto metax_handle = reinterpret_cast<device::metax::Handle *>(handle_);

    auto info_result = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info_result);

    *desc_ptr = new Descriptor(
        info_result.take(),
        /*workspace_size=*/0,
        new Opaque{metax_handle->internal()},
        metax_handle->device,
        metax_handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Converts an accumulator value back to the storage type; the 16-bit
// floating-point formats are routed through float conversion intrinsics.
template <typename Tdata, typename Tcompute>
__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
    if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
        return __float2bfloat16(static_cast<float>(val));
    } else if constexpr (std::is_same_v<Tdata, half>) {
        return __float2half(static_cast<float>(val));
    } else {
        return static_cast<Tdata>(val);
    }
}
// Grid-stride 1-D average pooling kernel for the metax backend.
// One output element per iteration; count_include_pad semantics (the
// divisor is always kernel_size). Accumulates in Tcompute and converts
// back to Tdata on store.
template <typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL avgPool1dGlobalKernel(
    Tdata *y,
    const Tdata *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    size_t total_elements = batch * channels * out_width;
    Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        // Decompose the flat index into (b, c, ow).
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
        size_t x_base = b * x_stride_batch + c * x_stride_channel;
        // Clip the window [start_w, end_w) to the valid input range; padded
        // positions contribute zero to the sum.
        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
        long long end_w = start_w + static_cast<long long>(kernel_size);
        long long iw_start = start_w < 0 ? 0 : start_w;
        long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
        Tcompute sum = Tcompute(0);
        if (iw_start < iw_end) {
            // Walk the window by repeatedly adding the width stride.
            size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
            for (long long iw = iw_start; iw < iw_end; ++iw) {
                sum += static_cast<Tcompute>(x[x_offset]);
                x_offset += x_stride_width;
            }
        }
        y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
    }
}
// Host-side launcher for the metax backend: picks a block size (default
// 256, clamped to the device limit when known), derives a capped grid and
// launches the grid-stride kernel. The 65535-block cap is safe because the
// kernel strides over any remaining elements.
template <typename Tdata, typename Tcompute>
infiniStatus_t calculateAvgPool1d(
    const AvgPool1dInfo &info,
    int max_threads_per_block,
    Tdata *y,
    const Tdata *x,
    hcStream_t stream) {
    const size_t element_count = info.batch * info.channels * info.out_width;

    int threads = 256;
    if (max_threads_per_block > 0 && max_threads_per_block < threads) {
        threads = max_threads_per_block;
    }

    size_t blocks = (element_count + threads - 1) / threads;
    blocks = blocks > 65535 ? 65535 : blocks;

    avgPool1dGlobalKernel<Tdata, Tcompute><<<blocks, threads, 0, stream>>>(
        y, x,
        info.batch, info.channels, info.in_width, info.out_width,
        info.kernel_size, info.stride, info.padding,
        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
    return INFINI_STATUS_SUCCESS;
}
// Forwards to the typed launcher with the device's max-threads-per-block
// limit and the metax stream.
#define CALCULATE(TDATA, TCOMPUTE)               \
    calculateAvgPool1d<TDATA, TCOMPUTE>(         \
        _info,                                   \
        _opaque->internal->maxThreadsPerBlock(), \
        (TDATA *)y,                              \
        (const TDATA *)x,                        \
        (hcStream_t)stream)

// Dispatches on dtype: 16-bit float types accumulate in float, F64 in
// double. The workspace pointer is unused (workspaceSize() is 0) but the
// size is still validated for API consistency.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    (void)workspace;
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(half, float);
    case INFINI_DTYPE_BF16:
        return CALCULATE(cuda_bfloat16, float);
    case INFINI_DTYPE_F32:
        return CALCULATE(float, float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double, double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE
} // namespace op::avg_pool1d::metax
#ifndef __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
#define __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__

#include <type_traits>

namespace op::avg_pool1d::moore {

// Converts an accumulator value back to the storage type; the 16-bit
// floating-point formats are routed through float conversion intrinsics.
template <typename Tdata, typename Tcompute>
__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
    if constexpr (std::is_same_v<Tdata, half>) {
        return __float2half(static_cast<float>(val));
    } else if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
        return __float2bfloat16_rn(static_cast<float>(val));
    } else {
        return static_cast<Tdata>(val);
    }
}

// Grid-stride 1-D average pooling device kernel for the moore backend.
// One output element per iteration; count_include_pad semantics (the
// divisor is always kernel_size). Accumulates in Tcompute and converts
// back to Tdata on store.
template <typename Tdata, typename Tcompute>
__device__ void avgPool1dKernel(
    Tdata *y,
    const Tdata *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    size_t total_elements = batch * channels * out_width;
    Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        // Decompose the flat index into (b, c, ow).
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
        size_t x_base = b * x_stride_batch + c * x_stride_channel;
        // Clip the window [start_w, end_w) to the valid input range; padded
        // positions contribute zero to the sum.
        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
        long long end_w = start_w + static_cast<long long>(kernel_size);
        long long iw_start = start_w < 0 ? 0 : start_w;
        long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
        Tcompute sum = Tcompute(0);
        if (iw_start < iw_end) {
            // Walk the window by repeatedly adding the width stride.
            size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
            for (long long iw = iw_start; iw < iw_end; ++iw) {
                sum += static_cast<Tcompute>(x[x_offset]);
                x_offset += x_stride_width;
            }
        }
        y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
    }
}
} // namespace op::avg_pool1d::moore
#endif // __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
#ifndef __INFINIOP_AVG_POOL1D_MOORE_H__
#define __INFINIOP_AVG_POOL1D_MOORE_H__

#include "../avg_pool1d.h"

// Declares op::avg_pool1d::moore::Descriptor.
DESCRIPTOR(moore)

#endif // __INFINIOP_AVG_POOL1D_MOORE_H__
#include "../../../devices/moore/moore_common.h"
#include "avg_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "avg_pool1d_kernel.h"
namespace op::avg_pool1d::moore {

// Per-descriptor opaque state: keeps the moore handle internals alive so
// device limits (e.g. max threads per block) can be queried at launch time.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates (y, x, kernel_size, stride, padding) via
// AvgPool1dInfo::createAvgPool1dInfo and, on success, allocates a
// Descriptor that shares the moore handle internals. No device workspace
// is required (workspace size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto moore_handle = reinterpret_cast<device::moore::Handle *>(handle_);

    auto info_result = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info_result);

    *desc_ptr = new Descriptor(
        info_result.take(),
        /*workspace_size=*/0,
        new Opaque{moore_handle->internal()},
        moore_handle->device,
        moore_handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Kernel entry point wrapping the shared device-side avgPool1dKernel
// (avg_pool1d_kernel.h). Launched with a 1-D grid; the inner kernel uses a
// grid-stride loop, so any grid size is correct.
template <typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL avgPool1dGlobalKernel(
    Tdata *y,
    const Tdata *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    avgPool1dKernel<Tdata, Tcompute>(
        y, x,
        batch, channels, in_width, out_width,
        kernel_size, stride, padding,
        y_stride_batch, y_stride_channel, y_stride_width,
        x_stride_batch, x_stride_channel, x_stride_width);
}
// Host-side launcher for the moore backend: picks a block size (default
// 256, clamped to the device limit when known), derives a capped grid and
// launches the grid-stride kernel. The 65535-block cap is safe because the
// kernel strides over any remaining elements.
template <typename Tdata, typename Tcompute>
infiniStatus_t calculateAvgPool1d(
    const AvgPool1dInfo &info,
    int max_threads_per_block,
    Tdata *y,
    const Tdata *x,
    musaStream_t stream) {
    const size_t element_count = info.batch * info.channels * info.out_width;

    int threads = 256;
    if (max_threads_per_block > 0 && max_threads_per_block < threads) {
        threads = max_threads_per_block;
    }

    size_t blocks = (element_count + threads - 1) / threads;
    blocks = blocks > 65535 ? 65535 : blocks;

    avgPool1dGlobalKernel<Tdata, Tcompute><<<blocks, threads, 0, stream>>>(
        y, x,
        info.batch, info.channels, info.in_width, info.out_width,
        info.kernel_size, info.stride, info.padding,
        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
    return INFINI_STATUS_SUCCESS;
}
// Forwards to the typed launcher with the device's max-threads-per-block
// limit and the moore stream.
#define CALCULATE(TDATA, TCOMPUTE)               \
    calculateAvgPool1d<TDATA, TCOMPUTE>(         \
        _info,                                   \
        _opaque->internal->maxThreadsPerBlock(), \
        (TDATA *)y,                              \
        (const TDATA *)x,                        \
        (musaStream_t)stream)

// Dispatches on dtype: 16-bit float types accumulate in float, F64 in
// double. The workspace pointer is unused (workspaceSize() is 0) but the
// size is still validated for API consistency.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    (void)workspace;
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(half, float);
    case INFINI_DTYPE_BF16:
        return CALCULATE(cuda_bfloat16, float);
    case INFINI_DTYPE_F32:
        return CALCULATE(float, float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double, double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE
} // namespace op::avg_pool1d::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "avg_pool1d_nvidia.cuh"
// __global__ entry point wrapping the shared device-side avgPool1dKernel
// (../cuda/kernel.cuh). Launched with a 1-D grid; the inner kernel uses a
// grid-stride loop, so any grid size is correct.
// NOTE(review): unlike the moore variant, this kernel is declared at global
// scope rather than inside op::avg_pool1d::nvidia — consider namespacing it
// to avoid potential symbol clashes.
template <typename T>
__global__ void avgPool1dGlobalKernel(
    T *y,
    const T *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    avgPool1dKernel<T>(
        y, x,
        batch, channels, in_width, out_width,
        kernel_size, stride, padding,
        y_stride_batch, y_stride_channel, y_stride_width,
        x_stride_batch, x_stride_channel, x_stride_width);
}
namespace op::avg_pool1d::nvidia {

// Per-descriptor opaque state: keeps the nvidia handle internals alive so
// device limits (e.g. max threads per block) can be queried at launch time.
struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates the tensor descriptors / pooling parameters and allocates an
// NVIDIA descriptor that shares the handle internals. No device workspace
// is required (workspace size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info);
    *desc_ptr = new Descriptor(
        info.take(),
        0, // no workspace needed
        // `handle` is already a device::nvidia::Handle*; the previous
        // redundant reinterpret_cast here was removed.
        new Opaque{handle->internal()},
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Host-side launcher for the CUDA avg-pool kernel: chooses a block size
// (default 256, clamped to the device's max-threads-per-block when that
// limit is known), computes a capped grid, and launches the grid-stride
// kernel. The 65535-block cap is safe because the kernel strides over any
// remaining elements.
template <typename T>
infiniStatus_t calculateAvgPool1d(
    const AvgPool1dInfo &info,
    int max_threads_per_block,
    T *y,
    const T *x,
    cudaStream_t stream) {
    size_t total_elements = info.batch * info.channels * info.out_width;
    int block_size = 256;
    // Compare against block_size rather than repeating the literal 256, so
    // the clamp stays correct if the default ever changes; this matches the
    // metax/moore launchers.
    if (max_threads_per_block > 0 && max_threads_per_block < block_size) {
        block_size = max_threads_per_block;
    }
    size_t grid_size = (total_elements + block_size - 1) / block_size;
    if (grid_size > 65535) {
        grid_size = 65535;
    }
    avgPool1dGlobalKernel<T><<<grid_size, block_size, 0, stream>>>(
        y, x,
        info.batch, info.channels, info.in_width, info.out_width,
        info.kernel_size, info.stride, info.padding,
        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
    return INFINI_STATUS_SUCCESS;
}
// Forwards to the typed launcher with the device's max-threads-per-block
// limit and the CUDA stream.
#define CALCULATE(TDATA)                                        \
    calculateAvgPool1d(_info,                                   \
                       _opaque->internal->maxThreadsPerBlock(), \
                       (TDATA *)y,                              \
                       (const TDATA *)x,                        \
                       (cudaStream_t)stream)

// Dispatches on the dtype validated at create time. The workspace is
// unused (workspaceSize() is 0); the size check was added to mirror the
// metax/moore implementations and is behavior-compatible since
// _workspace_size is always 0 here.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    (void)workspace;
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(half);
    case INFINI_DTYPE_BF16:
        return CALCULATE(cuda_bfloat16);
    case INFINI_DTYPE_F32:
        return CALCULATE(float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE
} // namespace op::avg_pool1d::nvidia
#ifndef __INFINIOP_AVG_POOL1D_CUDA_H__
#define __INFINIOP_AVG_POOL1D_CUDA_H__

#include "../avg_pool1d.h"

// Declares op::avg_pool1d::nvidia::Descriptor.
DESCRIPTOR(nvidia)

#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/avg_pool1d.h"
#ifdef ENABLE_CPU_API
#include "cpu/avg_pool1d_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/avg_pool1d_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/avg_pool1d_ascend.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/avg_pool1d_bang.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/avg_pool1d_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/avg_pool1d_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/avg_pool1d_moore.h"
#endif
// C API entry point: creates an avg_pool1d descriptor for the device
// identified by `handle->device`, forwarding to the matching backend
// implementation with the given pooling hyper-parameters.
__INFINI_C infiniStatus_t infiniopCreateAvgPool1dDescriptor(
    infiniopHandle_t handle,
    infiniopAvgPool1dDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x,
    size_t kernel_size,
    size_t stride,
    size_t padding) {

// One switch case per backend: cast desc_ptr to the backend descriptor
// type and delegate to its Descriptor::create.
#define CREATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                    \
        return op::avg_pool1d::NAMESPACE::Descriptor::create(                     \
            handle,                                                               \
            reinterpret_cast<op::avg_pool1d::NAMESPACE::Descriptor **>(desc_ptr), \
            y,                                                                    \
            x,                                                                    \
            kernel_size,                                                          \
            stride,                                                               \
            padding)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar/QY/Hygon reuse the nvidia implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
        CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Reports the workspace size (in bytes) required by infiniopAvgPool1d for
// the given descriptor, dispatching on its device type.
__INFINI_C infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(infiniopAvgPool1dDescriptor_t desc,
                                                            size_t *size) {

// One switch case per backend: read workspaceSize() from the typed descriptor.
#define GET(CASE, NAMESPACE)                                                                            \
    case CASE:                                                                                          \
        *size = reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        GET(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// C API entry point: runs 1-D average pooling of `x` into `y` on the
// backend the descriptor was created for. `workspace`/`workspace_size`
// must satisfy infiniopGetAvgPool1dWorkspaceSize; `stream` is the backend
// stream handle (ignored by the CPU path).
__INFINI_C infiniStatus_t infiniopAvgPool1d(
    infiniopAvgPool1dDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

// One switch case per backend: forward to the typed descriptor's calculate().
#define CALCULATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                       \
        return reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, x, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// C API entry point: destroys a descriptor previously created with
// infiniopCreateAvgPool1dDescriptor, deleting it through its concrete
// backend type.
__INFINI_C infiniStatus_t
infiniopDestroyAvgPool1dDescriptor(infiniopAvgPool1dDescriptor_t desc) {

// One switch case per backend: delete via the typed descriptor pointer.
#define DELETE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                       \
        delete reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
#include "cross_entropy_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::cross_entropy::cpu {

// CPU implementation of row-wise cross-entropy loss:
//   y[i] = log(sum_j exp(x[i, j])) - x[i, target[i]]
// where the reduction runs over the last (vocab) axis of x and one loss value
// is produced per target entry.

Descriptor::~Descriptor() = default;

// Validate the tensor descriptors and record the sizes the kernel needs.
//
// x      — logits of shape [..., vocab_size], dtype F16/F32/BF16
// target — integer class indices (I32/I64), one per logit row
// y      — per-row loss, same dtype as x, one element per target entry
//
// Returns INFINI_STATUS_BAD_TENSOR_DTYPE / _SHAPE on invalid inputs.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();

    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);

    // The output must carry the same element type as the logits: calculate()
    // reinterprets y using the dtype recorded from x, so a mismatch would
    // silently write through the wrong type.
    if (y_desc->dtype() != x_dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    // x needs at least one dimension — its last axis is the vocab axis, and
    // shape().back() on a 0-d tensor would be undefined behavior.
    const auto &x_shape = x_desc->shape();
    if (x_shape.empty()) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }

    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.outer_size = target_desc->numel(); // one loss value per target entry
    info.vocab_size = x_shape.back();       // size of the reduced axis
    // NOTE(review): rows are assumed contiguous (stride == vocab_size);
    // confirm callers never pass strided x before relying on this.
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);

    // Consistency: x must hold exactly outer_size rows of vocab_size logits,
    // and y must hold one scalar per row.
    if (x_desc->numel() != info.outer_size * info.vocab_size
        || y_desc->numel() != info.outer_size) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }

    *desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Row-wise cross-entropy kernel, parallelized over rows with OpenMP.
//
// The log-sum-exp is evaluated in float with the max-subtraction trick for
// numerical stability regardless of T. Out-of-range target indices
// (including negative ones) produce a loss of 0 — an "ignore" convention
// rather than an error.
//
// T    — logit/loss element type (fp16_t, bf16_t or float)
// Tidx — integer type of the target indices (int32_t or int64_t)
template <typename T, typename Tidx>
infiniStatus_t cross_entropy_kernel(const CrossEntropyInfo *info,
                                    T *y, const T *x, const void *target) {
    const Tidx *label = reinterpret_cast<const Tidx *>(target);

#pragma omp parallel for
    for (ptrdiff_t i = 0; i < ptrdiff_t(info->outer_size); ++i) {
        const T *row = x + i * info->x_stride;
        Tidx idx = label[i];
        // Silently skip rows whose label falls outside [0, vocab_size).
        if (idx < 0 || static_cast<size_t>(idx) >= info->vocab_size) {
            y[i] = utils::cast<T>(0.f);
            continue;
        }

        // Stable log-sum-exp: subtract the row maximum before exponentiating.
        float max_val = op::common_cpu::reduce_op::max(row, info->vocab_size, 1);
        float sum_exp = 0.f;
        for (size_t j = 0; j < info->vocab_size; ++j) {
            sum_exp += std::exp(utils::cast<float>(row[j]) - max_val);
        }
        float log_term = std::log(sum_exp) + max_val;

        float target_logit = utils::cast<float>(row[idx]);
        y[i] = utils::cast<T>(log_term - target_logit);
    }
    return INFINI_STATUS_SUCCESS;
}

// Second-level dispatch: select the kernel instantiation matching the
// target-index dtype recorded at create() time.
template <typename T>
infiniStatus_t dispatch_target_type(const CrossEntropyInfo *info,
                                    T *y, const T *x, const void *target) {
    if (info->target_dtype == INFINI_DTYPE_I32) {
        return cross_entropy_kernel<T, int32_t>(info, y, x, target);
    } else if (info->target_dtype == INFINI_DTYPE_I64) {
        return cross_entropy_kernel<T, int64_t>(info, y, x, target);
    }
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

// First-level dispatch on the logit dtype. workspace/workspace_size/stream
// are unused on CPU (create() records a workspace size of 0, and execution
// is synchronous).
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *target,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return dispatch_target_type(&_info, (fp16_t *)y, (const fp16_t *)x, target);
    case INFINI_DTYPE_BF16:
        return dispatch_target_type(&_info, (bf16_t *)y, (const bf16_t *)x, target);
    case INFINI_DTYPE_F32:
        return dispatch_target_type(&_info, (float *)y, (const float *)x, target);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::cross_entropy::cpu
#ifndef __CROSS_ENTROPY_CPU_H__
#define __CROSS_ENTROPY_CPU_H__
#include "../cross_entropy.h"
// Expands to the declaration of op::cross_entropy::cpu::Descriptor; the
// member definitions live in the accompanying CPU source file.
DESCRIPTOR(cpu)
#endif
#ifndef CROSS_ENTROPY_H
#define CROSS_ENTROPY_H
#include "../../operator.h"
#include "info.h"
// Declares op::cross_entropy::<NAMESPACE>::Descriptor for one backend.
// Each backend defines ~Descriptor(), create() and calculate() in its own
// translation unit; the class layout (opaque backend state, the shared
// CrossEntropyInfo, and the required workspace size) is identical across
// backends. The private constructor forces creation through create().
// NOTE: comments are deliberately kept outside the macro body — a `//`
// comment before a line-continuation backslash would splice the next macro
// line into the comment and break the expansion.
#define DESCRIPTOR(NAMESPACE)                                             \
    namespace op::cross_entropy::NAMESPACE {                              \
    class Descriptor final : public InfiniopDescriptor {                  \
        struct Opaque;                                                    \
        Opaque *_opaque;                                                  \
        CrossEntropyInfo _info;                                           \
        size_t _workspace_size;                                           \
                                                                          \
        Descriptor(Opaque *opaque,                                        \
                   CrossEntropyInfo info,                                 \
                   size_t workspace_size,                                 \
                   infiniDevice_t device_type,                            \
                   int device_id)                                         \
            : InfiniopDescriptor{device_type, device_id},                 \
              _opaque(opaque),                                            \
              _info(info),                                                \
              _workspace_size(workspace_size) {}                          \
                                                                          \
    public:                                                               \
        ~Descriptor();                                                    \
        size_t workspaceSize() const { return _workspace_size; }          \
        static infiniStatus_t create(infiniopHandle_t handle,             \
                                     Descriptor **desc_ptr,               \
                                     infiniopTensorDescriptor_t y_desc,   \
                                     infiniopTensorDescriptor_t x_desc,   \
                                     infiniopTensorDescriptor_t target_desc); \
        infiniStatus_t calculate(void *workspace,                         \
                                 size_t workspace_size,                   \
                                 void *y,                                 \
                                 const void *x,                           \
                                 const void *target,                      \
                                 void *stream) const;                     \
    };                                                                    \
    }
#endif // CROSS_ENTROPY_H
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment