Unverified commit 09d4b2ae authored by thatPepe, committed by GitHub

Merge pull request #1071 from InfiniTensor/issue/1031_T1-1-9

[Operator Competition, Fall 2025] T1-1-9
parents 5fc85c8b 85f8987c
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "adaptive_max_pool1d_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_CUDA_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
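// The 0 passed to the Descriptor constructor below is the workspace size:
// this operator needs no device scratch memory.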
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t num_blocks,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
cudaStream_t cuda_stream) {
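// Tdata is the on-device storage dtype of x and y; Tcompute is the type the kernel
// computes in (f16 and bf16 are promoted to float below, f32/f64 compute natively).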
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__nv_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
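// Pick the largest block size the device supports; each block handles one
// (batch, channel) row of the input.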
if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
cuda_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::nvidia
#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_H__
#define __ADAPTIVE_MAX_POOL1D_CUDA_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(nvidia)
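// DESCRIPTOR(nvidia) expands the shared Descriptor declaration from
// ../adaptive_max_pool1d.h into the op::adaptive_max_pool1d::nvidia namespace.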
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/adaptive_max_pool1d.h"
#ifdef ENABLE_CPU_API
#include "cpu/adaptive_max_pool1d_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/adaptive_max_pool1d_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/adaptive_max_pool1d_metax.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/adaptive_max_pool1d_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor(
infiniopHandle_t handle,
infiniopAdaptiveMaxPool1dDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
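// Compile-time device dispatch: each enabled backend maps its device enum
// to Descriptor::create in the matching namespace.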
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::adaptive_max_pool1d::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
x_desc, \
output_size)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef CREATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize(
infiniopAdaptiveMaxPool1dDescriptor_t desc,
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopAdaptiveMaxPool1d(
infiniopAdaptiveMaxPool1dDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, y, x, stream);
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef CALCULATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor(
infiniopAdaptiveMaxPool1dDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef DESTROY
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
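// Typical lifecycle of this API, as a sketch (handle, y_desc, x_desc, output_size,
// d_y, d_x, workspace, and stream are assumed to exist; the names are illustrative):
//
//   infiniopAdaptiveMaxPool1dDescriptor_t desc;
//   infiniopCreateAdaptiveMaxPool1dDescriptor(handle, &desc, y_desc, x_desc, output_size);
//   size_t ws = 0;
//   infiniopGetAdaptiveMaxPool1dWorkspaceSize(desc, &ws);
//   // ... allocate at least ws bytes of device memory as workspace ...
//   infiniopAdaptiveMaxPool1d(desc, workspace, ws, d_y, d_x, stream);
//   infiniopDestroyAdaptiveMaxPool1dDescriptor(desc);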
#include "asinh_cpu.h"
namespace op::asinh::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<AsinhOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<AsinhOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<AsinhOp, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<AsinhOp, bf16_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::asinh::cpu
#ifndef __ASINH_CPU_H__
#define __ASINH_CPU_H__
#include <cmath>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(asinh, cpu)
namespace op::asinh::cpu {
typedef struct AsinhOp {
public:
static constexpr size_t num_inputs = 1;
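// asinh(x) = ln(x + sqrt(x^2 + 1)); std::asinh evaluates this per element
// via the elementwise framework.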
template <typename T>
T operator()(const T &x) const {
return std::asinh(x);
}
} AsinhOp;
} // namespace op::asinh::cpu
#endif // __ASINH_CPU_H__
#ifndef __ASINH_CUDA_KERNEL_H__
#define __ASINH_CUDA_KERNEL_H__
namespace op::asinh::cuda {
typedef struct AsinhOp {
public:
static constexpr size_t num_inputs = 1;
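// half and bfloat16 have no native asinh, so both are promoted to float,
// evaluated with asinhf, and converted back.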
template <typename T>
__device__ __forceinline__ T operator()(const T &x) const {
if constexpr (std::is_same_v<T, half>) {
float x_f = __half2float(x);
return __float2half(asinhf(x_f));
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
float x_f = __bfloat162float(x);
return __float2bfloat16(asinhf(x_f));
} else if constexpr (std::is_same_v<T, float>) {
return asinhf(x);
} else {
return ::asinh(x);
}
}
} AsinhOp;
} // namespace op::asinh::cuda
#endif // __ASINH_CUDA_KERNEL_H__
#include "../../../elementwise/metax/elementwise_metax.h"
#include "asinh_metax.h"
#include "../cuda/kernel.cuh"
namespace op::asinh::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create METAX elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::asinh::metax
#ifndef __ASINH_METAX_API_H__
#define __ASINH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(asinh, metax)
#endif // __ASINH_METAX_API_H__
#ifndef __ASINH_MOORE_API_H__
#define __ASINH_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
ELEMENTWISE_DESCRIPTOR(asinh, moore)
#endif // __ASINH_MOORE_API_H__
#include "asinh_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "../cuda/kernel.cuh"
namespace op::asinh::moore {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create MOORE elementwise descriptor
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::asinh::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "asinh_nvidia.cuh"
namespace op::asinh::nvidia {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::asinh::nvidia
#ifndef __ASINH_NVIDIA_API_H__
#define __ASINH_NVIDIA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(asinh, nvidia)
#endif // __ASINH_NVIDIA_API_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/asinh.h"
#ifdef ENABLE_CPU_API
#include "cpu/asinh_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/asinh_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/asinh_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/asinh_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateAsinhDescriptor(
infiniopHandle_t handle,
infiniopAsinhDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::asinh::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::asinh::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
{x_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<const op::asinh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
}
__INFINI_C infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::asinh::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, {x}, stream);
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::asinh::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DESTROY
}
#include "fmod_cpu.h"
namespace op::fmod__::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &out_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);
// create CPU elementwise descriptor
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<FmodOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<FmodOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<FmodOp, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<FmodOp, bf16_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::fmod__::cpu
#ifndef __FMOD_CPU_H__
#define __FMOD_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(fmod__, cpu)
namespace op::fmod__::cpu {
typedef struct FmodOp {
public:
static constexpr size_t num_inputs = 2;
template <typename T>
T operator()(const T &a, const T &b) const {
return std::fmod(a, b);
}
} FmodOp;
} // namespace op::fmod__::cpu
#endif // __FMOD_CPU_H__
#ifndef __FMOD_CUDA_H__
#define __FMOD_CUDA_H__
namespace op::fmod__::cuda {
typedef struct FmodOp {
static constexpr size_t num_inputs = 2;
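// C-style remainder: the result takes the sign of the dividend a,
// e.g. fmod(-5.0, 3.0) == -2.0 (and -5 % 3 == -2 for the integer branch).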
template <typename T>
__device__ __forceinline__ T operator()(const T &a, const T &b) const {
// fmod(a, b) = a - b * trunc(a / b)
if constexpr (std::is_same_v<T, half2>) {
// For half2, convert to float, compute, then convert back
float2 af = __half22float2(a);
float2 bf = __half22float2(b);
float2 result;
result.x = fmodf(af.x, bf.x);
result.y = fmodf(af.y, bf.y);
return __float22half2_rn(result);
} else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
// For bfloat162, convert to float, compute, then convert back
float af_low = __bfloat162float(__low2bfloat16(a));
float af_high = __bfloat162float(__high2bfloat16(a));
float bf_low = __bfloat162float(__low2bfloat16(b));
float bf_high = __bfloat162float(__high2bfloat16(b));
return __floats2bfloat162_rn(fmodf(af_low, bf_low), fmodf(af_high, bf_high));
} else if constexpr (std::is_same_v<T, half>) {
// For half, convert to float, compute, then convert back
float af = __half2float(a);
float bf = __half2float(b);
return __float2half(fmodf(af, bf));
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// For bfloat16, convert to float, compute, then convert back
float af = __bfloat162float(a);
float bf = __bfloat162float(b);
return __float2bfloat16(fmodf(af, bf));
} else if constexpr (std::is_same_v<T, float>) {
return fmodf(a, b);
} else if constexpr (std::is_same_v<T, double>) {
return fmod(a, b);
} else {
// Integer types use the % operator
return a % b;
}
}
} FmodOp;
} // namespace op::fmod__::cuda
#endif // __FMOD_CUDA_H__
#ifndef __FMOD_METAX_API_H__
#define __FMOD_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(fmod__, metax)
#endif // __FMOD_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "fmod_metax.h"
namespace op::fmod__::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::FmodOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::FmodOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::FmodOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::FmodOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::fmod__::metax
#ifndef __FMOD_MOORE_API_H__
#define __FMOD_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
ELEMENTWISE_DESCRIPTOR(fmod__, moore)
#endif // __FMOD_MOORE_API_H__
#include "fmod_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "../cuda/kernel.cuh"
namespace op::fmod__::moore {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create MOORE elementwise descriptor
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::FmodOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::FmodOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::FmodOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::FmodOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::fmod__::moore