jerrrrry / infinicore · Commits

Commit 45a3794b, authored Mar 11, 2026 by wooway777
issue/1031 T1-1-17
Parent: cb7f0b7d

Showing 20 changed files with 1549 additions and 0 deletions
src/infiniop/ops/avg_pool1d/cuda/kernel.cuh                    +58
src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h           +8
src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca        +170
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h          +72
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h           +8
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu          +135
src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu        +126
src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh       +8
src/infiniop/ops/avg_pool1d/operator.cc                        +225
src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc        +99
src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h         +8
src/infiniop/ops/cross_entropy/cross_entropy.h                 +42
src/infiniop/ops/cross_entropy/cuda/kernel.cuh                 +80
src/infiniop/ops/cross_entropy/info.h                          +17
src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h     +8
src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca  +188
src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h    +53
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h     +8
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu    +129
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu  +107
src/infiniop/ops/avg_pool1d/cuda/kernel.cuh (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__
#define __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__

template <typename T>
__device__ void avgPool1dKernel(
    T *y,
    const T *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    size_t total_elements = batch * channels * out_width;
    // Grid-stride loop: one output element per iteration.
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        // Decompose the flat index into (batch, channel, output position).
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
        long long start_w = static_cast<long long>(ow * stride) - padding;
        T sum = 0;
        for (size_t k = 0; k < kernel_size; ++k) {
            long long iw = start_w + k;
            if (iw >= 0 && iw < static_cast<long long>(in_width)) {
                size_t x_offset = b * x_stride_batch + c * x_stride_channel + iw * x_stride_width;
                sum += x[x_offset];
            }
        }
#if defined(ENABLE_ILUVATAR_API)
        // Iluvatar __half doesn't accept size_t directly.
        y[y_offset] = sum / static_cast<T>(static_cast<double>(kernel_size));
#else
        y[y_offset] = sum / static_cast<T>(kernel_size);
#endif
    }
}

#endif
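A quick host-side cross-check of this kernel's semantics can help when porting it to new backends. The sketch below is not part of the commit; it assumes contiguous NCW layout and the usual shape relation out_width = (in_width + 2*padding - kernel_size) / stride + 1, which this commit computes elsewhere (in AvgPool1dInfo, not shown here):

#include <cstddef>
#include <vector>

// Host-side reference for contiguous NCW tensors; a checking sketch only,
// not part of this commit.
std::vector<float> avgPool1dRef(const std::vector<float> &x,
                                size_t batch, size_t channels, size_t in_width,
                                size_t kernel_size, size_t stride, size_t padding) {
    size_t out_width = (in_width + 2 * padding - kernel_size) / stride + 1;
    std::vector<float> y(batch * channels * out_width, 0.0f);
    for (size_t b = 0; b < batch; ++b)
        for (size_t c = 0; c < channels; ++c)
            for (size_t ow = 0; ow < out_width; ++ow) {
                long long start = (long long)(ow * stride) - (long long)padding;
                float sum = 0.0f;
                for (size_t k = 0; k < kernel_size; ++k) {
                    long long iw = start + (long long)k;
                    if (iw >= 0 && iw < (long long)in_width)
                        sum += x[(b * channels + c) * in_width + iw];
                }
                // Like the kernel above, divide by kernel_size even when part
                // of the window falls in the padding (count_include_pad).
                y[(b * channels + c) * out_width + ow] = sum / (float)kernel_size;
            }
    return y;
}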
src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_METAX_H__
#define __INFINIOP_AVG_POOL1D_METAX_H__

#include "../avg_pool1d.h"

DESCRIPTOR(metax)

#endif // __INFINIOP_AVG_POOL1D_METAX_H__
src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca (new file, mode 100644)
#include "../../../devices/metax/metax_common.h"
#include "avg_pool1d_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include <type_traits>
namespace op::avg_pool1d::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t kernel_size,
size_t stride,
size_t padding) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{handle->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tcompute>
__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
if constexpr (std::is_same_v<Tdata, half>) {
return __float2half(static_cast<float>(val));
} else if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
return __float2bfloat16(static_cast<float>(val));
} else {
return static_cast<Tdata>(val);
}
}
template <typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL avgPool1dGlobalKernel(
Tdata *y,
const Tdata *x,
size_t batch,
size_t channels,
size_t in_width,
size_t out_width,
size_t kernel_size,
size_t stride,
size_t padding,
ptrdiff_t y_stride_batch,
ptrdiff_t y_stride_channel,
ptrdiff_t y_stride_width,
ptrdiff_t x_stride_batch,
ptrdiff_t x_stride_channel,
ptrdiff_t x_stride_width) {
size_t total_elements = batch * channels * out_width;
Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < total_elements;
idx += gridDim.x * blockDim.x) {
size_t ow = idx % out_width;
size_t temp = idx / out_width;
size_t c = temp % channels;
size_t b = temp / channels;
size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
size_t x_base = b * x_stride_batch + c * x_stride_channel;
long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
long long end_w = start_w + static_cast<long long>(kernel_size);
long long iw_start = start_w < 0 ? 0 : start_w;
long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
Tcompute sum = Tcompute(0);
if (iw_start < iw_end) {
size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
for (long long iw = iw_start; iw < iw_end; ++iw) {
sum += static_cast<Tcompute>(x[x_offset]);
x_offset += x_stride_width;
}
}
y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
}
}
template <typename Tdata, typename Tcompute>
infiniStatus_t calculateAvgPool1d(
const AvgPool1dInfo &info,
int max_threads_per_block,
Tdata *y,
const Tdata *x,
hcStream_t stream) {
size_t total_elements = info.batch * info.channels * info.out_width;
int block_size = 256;
if (max_threads_per_block > 0 && max_threads_per_block < block_size) {
block_size = max_threads_per_block;
}
size_t grid_size = (total_elements + block_size - 1) / block_size;
if (grid_size > 65535) {
grid_size = 65535;
}
avgPool1dGlobalKernel<Tdata, Tcompute><<<grid_size, block_size, 0, stream>>>(
y, x,
info.batch, info.channels, info.in_width, info.out_width,
info.kernel_size, info.stride, info.padding,
info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE(TDATA, TCOMPUTE) \
calculateAvgPool1d<TDATA, TCOMPUTE>( \
_info, \
_opaque->internal->maxThreadsPerBlock(), \
(TDATA *)y, \
(const TDATA *)x, \
(hcStream_t)stream)
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) const {
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_info.dtype) {
case INFINI_DTYPE_F16:
return CALCULATE(half, float);
case INFINI_DTYPE_BF16:
return CALCULATE(cuda_bfloat16, float);
case INFINI_DTYPE_F32:
return CALCULATE(float, float);
case INFINI_DTYPE_F64:
return CALCULATE(double, double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
#undef CALCULATE
} // namespace op::avg_pool1d::metax
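The launch math above caps grid_size at 65535 blocks; correctness is preserved because the kernel's grid-stride loop sweeps any elements beyond one per thread. A standalone sketch of the same arithmetic, with illustrative sizes:

#include <cstddef>
#include <cstdio>

int main() {
    size_t total_elements = 20000000; // batch * channels * out_width
    int block_size = 256;
    size_t grid_size = (total_elements + block_size - 1) / block_size; // 78125
    if (grid_size > 65535) {
        grid_size = 65535; // clamped; the grid-stride loop covers the rest
    }
    size_t threads = grid_size * block_size;
    size_t max_iters = (total_elements + threads - 1) / threads; // 2 passes per thread
    std::printf("grid=%zu, block=%d, loop iterations per thread <= %zu\n",
                grid_size, block_size, max_iters);
    return 0;
}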
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
#define __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__

#include <type_traits>

namespace op::avg_pool1d::moore {

template <typename Tdata, typename Tcompute>
__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
    if constexpr (std::is_same_v<Tdata, half>) {
        return __float2half(static_cast<float>(val));
    } else if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
        return __float2bfloat16_rn(static_cast<float>(val));
    } else {
        return static_cast<Tdata>(val);
    }
}

template <typename Tdata, typename Tcompute>
__device__ void avgPool1dKernel(
    Tdata *y,
    const Tdata *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    size_t total_elements = batch * channels * out_width;
    Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
        size_t x_base = b * x_stride_batch + c * x_stride_channel;
        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
        long long end_w = start_w + static_cast<long long>(kernel_size);
        long long iw_start = start_w < 0 ? 0 : start_w;
        long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
        Tcompute sum = Tcompute(0);
        if (iw_start < iw_end) {
            size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
            for (long long iw = iw_start; iw < iw_end; ++iw) {
                sum += static_cast<Tcompute>(x[x_offset]);
                x_offset += x_stride_width;
            }
        }
        y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
    }
}

} // namespace op::avg_pool1d::moore

#endif // __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
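Unlike the CUDA kernel earlier in this commit, which bounds-checks every tap, this variant clamps the window to [iw_start, iw_end) once and then reads without branching. A standalone illustration of the clamping, with values chosen for demonstration:

#include <algorithm>
#include <cstdio>

int main() {
    const long long in_width = 8, kernel_size = 4, stride = 2, padding = 2;
    const long long out_width = (in_width + 2 * padding - kernel_size) / stride + 1; // 5
    for (long long ow = 0; ow < out_width; ++ow) {
        long long start_w = ow * stride - padding;
        long long end_w = start_w + kernel_size;
        long long iw_start = std::max(start_w, 0LL);
        long long iw_end = std::min(end_w, in_width);
        // ow=0 gives window [-2,2) -> clamped [0,2): only 2 of 4 taps are read,
        // yet the divisor stays kernel_size (count_include_pad semantics).
        std::printf("ow=%lld: window [%lld,%lld) -> clamped [%lld,%lld)\n",
                    ow, start_w, end_w, iw_start, iw_end);
    }
    return 0;
}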
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_MOORE_H__
#define __INFINIOP_AVG_POOL1D_MOORE_H__

#include "../avg_pool1d.h"

DESCRIPTOR(moore)

#endif // __INFINIOP_AVG_POOL1D_MOORE_H__
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu (new file, mode 100644)
#include "../../../devices/moore/moore_common.h"
#include "avg_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "avg_pool1d_kernel.h"
namespace op::avg_pool1d::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t kernel_size,
size_t stride,
size_t padding) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{handle->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL avgPool1dGlobalKernel(
Tdata *y,
const Tdata *x,
size_t batch,
size_t channels,
size_t in_width,
size_t out_width,
size_t kernel_size,
size_t stride,
size_t padding,
ptrdiff_t y_stride_batch,
ptrdiff_t y_stride_channel,
ptrdiff_t y_stride_width,
ptrdiff_t x_stride_batch,
ptrdiff_t x_stride_channel,
ptrdiff_t x_stride_width) {
avgPool1dKernel<Tdata, Tcompute>(
y, x,
batch, channels, in_width, out_width,
kernel_size, stride, padding,
y_stride_batch, y_stride_channel, y_stride_width,
x_stride_batch, x_stride_channel, x_stride_width);
}
template <typename Tdata, typename Tcompute>
infiniStatus_t calculateAvgPool1d(
const AvgPool1dInfo &info,
int max_threads_per_block,
Tdata *y,
const Tdata *x,
musaStream_t stream) {
size_t total_elements = info.batch * info.channels * info.out_width;
int block_size = 256;
if (max_threads_per_block > 0 && max_threads_per_block < block_size) {
block_size = max_threads_per_block;
}
size_t grid_size = (total_elements + block_size - 1) / block_size;
if (grid_size > 65535) {
grid_size = 65535;
}
avgPool1dGlobalKernel<Tdata, Tcompute><<<grid_size, block_size, 0, stream>>>(
y, x,
info.batch, info.channels, info.in_width, info.out_width,
info.kernel_size, info.stride, info.padding,
info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE(TDATA, TCOMPUTE)                      \
    calculateAvgPool1d<TDATA, TCOMPUTE>(                \
        _info,                                          \
        _opaque->internal->maxThreadsPerBlock(),        \
        (TDATA *)y,                                     \
        (const TDATA *)x,                               \
        (musaStream_t)stream)
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) const {
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_info.dtype) {
case INFINI_DTYPE_F16:
return CALCULATE(half, float);
case INFINI_DTYPE_BF16:
return CALCULATE(cuda_bfloat16, float);
case INFINI_DTYPE_F32:
return CALCULATE(float, float);
case INFINI_DTYPE_F64:
return CALCULATE(double, double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
#undef CALCULATE
} // namespace op::avg_pool1d::moore
src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu (new file, mode 100644)
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "avg_pool1d_nvidia.cuh"
template <typename T>
__global__ void avgPool1dGlobalKernel(
    T *y,
    const T *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    avgPool1dKernel<T>(
        y, x,
        batch, channels, in_width, out_width,
        kernel_size, stride, padding,
        y_stride_batch, y_stride_channel, y_stride_width,
        x_stride_batch, x_stride_channel, x_stride_width);
}

namespace op::avg_pool1d::nvidia {

struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info);
    *desc_ptr = new Descriptor(
        info.take(),
        0,
        new Opaque{handle->internal()},
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

template <typename T>
infiniStatus_t calculateAvgPool1d(
    const AvgPool1dInfo &info,
    int max_threads_per_block,
    T *y,
    const T *x,
    cudaStream_t stream) {
    size_t total_elements = info.batch * info.channels * info.out_width;
    int block_size = 256;
    if (max_threads_per_block > 0 && max_threads_per_block < 256) {
        block_size = max_threads_per_block;
    }
    size_t grid_size = (total_elements + block_size - 1) / block_size;
    if (grid_size > 65535) {
        grid_size = 65535;
    }
    avgPool1dGlobalKernel<T><<<grid_size, block_size, 0, stream>>>(
        y, x,
        info.batch, info.channels, info.in_width, info.out_width,
        info.kernel_size, info.stride, info.padding,
        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
    return INFINI_STATUS_SUCCESS;
}

#define CALCULATE(TDATA)                                        \
    calculateAvgPool1d(_info,                                   \
                       _opaque->internal->maxThreadsPerBlock(), \
                       (TDATA *)y,                              \
                       (const TDATA *)x,                        \
                       (cudaStream_t)stream)

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(half);
    case INFINI_DTYPE_BF16:
        return CALCULATE(cuda_bfloat16);
    case INFINI_DTYPE_F32:
        return CALCULATE(float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE

} // namespace op::avg_pool1d::nvidia
src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_CUDA_H__
#define __INFINIOP_AVG_POOL1D_CUDA_H__

#include "../avg_pool1d.h"

DESCRIPTOR(nvidia)

#endif
src/infiniop/ops/avg_pool1d/operator.cc (new file, mode 100644)
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/avg_pool1d.h"
#ifdef ENABLE_CPU_API
#include "cpu/avg_pool1d_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/avg_pool1d_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/avg_pool1d_ascend.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/avg_pool1d_bang.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/avg_pool1d_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/avg_pool1d_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/avg_pool1d_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateAvgPool1dDescriptor(
    infiniopHandle_t handle,
    infiniopAvgPool1dDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x,
    size_t kernel_size,
    size_t stride,
    size_t padding) {

#define CREATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                    \
        return op::avg_pool1d::NAMESPACE::Descriptor::create(                     \
            handle,                                                               \
            reinterpret_cast<op::avg_pool1d::NAMESPACE::Descriptor **>(desc_ptr), \
            y,                                                                    \
            x,                                                                    \
            kernel_size,                                                          \
            stride,                                                               \
            padding)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
        CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__INFINI_C infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(
    infiniopAvgPool1dDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                            \
    case CASE:                                                                                          \
        *size = reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        GET(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}

__INFINI_C infiniStatus_t infiniopAvgPool1d(
    infiniopAvgPool1dDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                       \
        return reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, x, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__INFINI_C infiniStatus_t infiniopDestroyAvgPool1dDescriptor(
    infiniopAvgPool1dDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                       \
    case CASE:                                                                        \
        delete reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
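For orientation, this is roughly how the four exported functions compose from the caller's side. The sketch is an assumption about intended usage, not code from this commit; handle and tensor-descriptor construction are elided because their APIs live elsewhere in the library, and CHECK_STATUS stands in for the caller's own error handling:

// Hypothetical call sequence for the C API defined in this file.
infiniStatus_t runAvgPool1d(infiniopHandle_t handle,
                            infiniopTensorDescriptor_t y_desc,
                            infiniopTensorDescriptor_t x_desc,
                            void *y, const void *x, void *stream) {
    infiniopAvgPool1dDescriptor_t desc = nullptr;
    CHECK_STATUS(infiniopCreateAvgPool1dDescriptor(
        handle, &desc, y_desc, x_desc,
        /*kernel_size=*/3, /*stride=*/2, /*padding=*/1));

    size_t workspace_size = 0;
    CHECK_STATUS(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size));
    // The backends in this commit report a workspace size of 0, so no
    // device allocation is needed here.
    void *workspace = nullptr;

    infiniStatus_t status = infiniopAvgPool1d(desc, workspace, workspace_size, y, x, stream);
    infiniopDestroyAvgPool1dDescriptor(desc);
    return status;
}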
src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc (new file, mode 100644)
#include "cross_entropy_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::cross_entropy::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();
    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.outer_size = target_desc->numel();
    info.vocab_size = x_desc->shape().back();
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
    *desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

template <typename T, typename Tidx>
infiniStatus_t cross_entropy_kernel(
    const CrossEntropyInfo *info,
    T *y,
    const T *x,
    const void *target) {
    const Tidx *label = reinterpret_cast<const Tidx *>(target);
#pragma omp parallel for
    for (ptrdiff_t i = 0; i < ptrdiff_t(info->outer_size); ++i) {
        const T *row = x + i * info->x_stride;
        Tidx idx = label[i];
        if (idx < 0 || static_cast<size_t>(idx) >= info->vocab_size) {
            y[i] = utils::cast<T>(0.f);
            continue;
        }
        float max_val = op::common_cpu::reduce_op::max(row, info->vocab_size, 1);
        float sum_exp = 0.f;
        for (size_t j = 0; j < info->vocab_size; ++j) {
            sum_exp += std::exp(utils::cast<float>(row[j]) - max_val);
        }
        float log_term = std::log(sum_exp) + max_val;
        float target_logit = utils::cast<float>(row[idx]);
        y[i] = utils::cast<T>(log_term - target_logit);
    }
    return INFINI_STATUS_SUCCESS;
}

template <typename T>
infiniStatus_t dispatch_target_type(
    const CrossEntropyInfo *info,
    T *y,
    const T *x,
    const void *target) {
    if (info->target_dtype == INFINI_DTYPE_I32) {
        return cross_entropy_kernel<T, int32_t>(info, y, x, target);
    } else if (info->target_dtype == INFINI_DTYPE_I64) {
        return cross_entropy_kernel<T, int64_t>(info, y, x, target);
    }
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *target,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return dispatch_target_type(&_info, (fp16_t *)y, (const fp16_t *)x, target);
    case INFINI_DTYPE_BF16:
        return dispatch_target_type(&_info, (bf16_t *)y, (const bf16_t *)x, target);
    case INFINI_DTYPE_F32:
        return dispatch_target_type(&_info, (float *)y, (const float *)x, target);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::cross_entropy::cpu
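The per-row loss computed above is the numerically stable log-sum-exp form of -log(softmax(x)[target]): with m = max_j x_j, loss = log Σ_j exp(x_j − m) + m − x_target. A self-contained numeric check, with illustrative values:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const float row[3] = {2.0f, 1.0f, 0.5f};
    const int target = 0;
    float m = *std::max_element(row, row + 3);
    float sum_exp = 0.0f;
    for (float v : row) {
        sum_exp += std::exp(v - m); // every exponent is <= 0, so no overflow
    }
    float loss = std::log(sum_exp) + m - row[target];
    // Equals -log(softmax(row)[target]); prints roughly 0.4644.
    std::printf("loss = %f\n", loss);
    return 0;
}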
src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h (new file, mode 100644)

#ifndef __CROSS_ENTROPY_CPU_H__
#define __CROSS_ENTROPY_CPU_H__

#include "../cross_entropy.h"

DESCRIPTOR(cpu)

#endif
src/infiniop/ops/cross_entropy/cross_entropy.h (new file, mode 100644)
#ifndef CROSS_ENTROPY_H
#define CROSS_ENTROPY_H
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
namespace op::cross_entropy::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
CrossEntropyInfo _info; \
size_t _workspace_size; \
\
Descriptor(Opaque *opaque, \
CrossEntropyInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
static infiniStatus_t create(infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t target_desc); \
infiniStatus_t calculate(void *workspace, \
size_t workspace_size, \
void *y, \
const void *x, \
const void *target, \
void *stream) const; \
}; \
}
#endif
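For reference, DESCRIPTOR(cpu) in cross_entropy_cpu.h stamps out roughly the following class (an abridged expansion of the macro above; one such class exists per backend namespace, all with the same interface):

// Abridged expansion of DESCRIPTOR(cpu).
namespace op::cross_entropy::cpu {
class Descriptor final : public InfiniopDescriptor {
    struct Opaque;
    Opaque *_opaque;
    CrossEntropyInfo _info;
    size_t _workspace_size;
    // ... private constructor as in the macro ...
public:
    ~Descriptor();
    size_t workspaceSize() const { return _workspace_size; }
    static infiniStatus_t create(infiniopHandle_t handle,
                                 Descriptor **desc_ptr,
                                 infiniopTensorDescriptor_t y_desc,
                                 infiniopTensorDescriptor_t x_desc,
                                 infiniopTensorDescriptor_t target_desc);
    infiniStatus_t calculate(void *workspace, size_t workspace_size,
                             void *y, const void *x, const void *target,
                             void *stream) const;
};
} // namespace op::cross_entropy::cpu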
src/infiniop/ops/cross_entropy/cuda/kernel.cuh (new file, mode 100644)
#ifndef __CROSS_ENTROPY_KERNEL_CUH__
#define __CROSS_ENTROPY_KERNEL_CUH__
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute = float>
__device__ void crossEntropyKernel(
    Tdata *y_,
    const Tdata *x_,
    const void *target_,
    size_t outer_size,
    size_t vocab_size,
    ptrdiff_t x_stride) {
    size_t row_idx = blockIdx.x;
    if (row_idx >= outer_size) {
        return;
    }
    const Tdata *x = x_ + row_idx * x_stride;
    const Tidx *target = reinterpret_cast<const Tidx *>(target_);
    Tidx label = target[row_idx];
    // Row maximum for the stable log-sum-exp.
    Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
    __shared__ Tcompute max_val_shared;
    if (threadIdx.x == 0) {
        max_val_shared = static_cast<Tcompute>(max_val_raw);
    }
    __syncthreads();
    Tcompute max_val = max_val_shared;
    Tcompute thread_sum = 0.0f;
    for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
        Tcompute val = static_cast<Tcompute>(x[col]);
        thread_sum += expf(val - max_val);
    }
    // Warp-level tree reduction of the per-thread partial sums.
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset);
    }
    static __shared__ Tcompute shared_sum[32];
    int lane = threadIdx.x % warpSize;
    int warp = threadIdx.x / warpSize;
    if (lane == 0) {
        shared_sum[warp] = thread_sum;
    }
    __syncthreads();
    // Warp 0 reduces the per-warp partials into the block total.
    Tcompute block_sum = 0.0f;
    if (warp == 0) {
        if (lane < (BLOCK_SIZE + warpSize - 1) / warpSize) {
            block_sum = shared_sum[lane];
        }
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            block_sum += __shfl_down_sync(0xffffffff, block_sum, offset);
        }
    }
    if (threadIdx.x == 0) {
        Tcompute log_term = logf(block_sum) + max_val;
        Tcompute target_logit = 0.0f;
        if (label >= 0 && static_cast<size_t>(label) < vocab_size) {
            target_logit = static_cast<Tcompute>(x[label]);
        } else {
            // Out-of-range target: emit 0 instead of a garbage loss.
            log_term = 0.0f;
        }
        y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
    }
}

#endif
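The block reduction here is hand-rolled in two stages: a warp-level tree reduction with __shfl_down_sync, then a second pass by warp 0 over the per-warp partials. A stripped-down sketch of the same pattern in isolation, assuming a fully populated block so the 0xffffffff participation mask is valid:

// Standalone sketch (not from this commit) of the two-stage block reduction.
template <unsigned int BLOCK_SIZE>
__global__ void blockSum(const float *in, float *out) {
    float v = in[blockIdx.x * BLOCK_SIZE + threadIdx.x];
    // Stage 1: each warp folds its 32 lanes into lane 0.
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        v += __shfl_down_sync(0xffffffff, v, offset);
    }
    __shared__ float partial[32];
    int lane = threadIdx.x % warpSize;
    int warp = threadIdx.x / warpSize;
    if (lane == 0) {
        partial[warp] = v;
    }
    __syncthreads();
    // Stage 2: the first warp folds the per-warp partials.
    if (warp == 0) {
        v = (lane < BLOCK_SIZE / warpSize) ? partial[lane] : 0.0f;
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            v += __shfl_down_sync(0xffffffff, v, offset);
        }
        if (lane == 0) {
            out[blockIdx.x] = v;
        }
    }
}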
src/infiniop/ops/cross_entropy/info.h (new file, mode 100644)
#ifndef CROSS_ENTROPY_INFO_H
#define CROSS_ENTROPY_INFO_H
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
#include <cstddef>
struct CrossEntropyInfo {
    int dtype;
    int target_dtype;
    size_t outer_size;
    size_t vocab_size;
    ptrdiff_t x_stride;
};

#endif
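As a concrete reading of these fields: for contiguous logits of shape [batch, seq_len, vocab] with targets of shape [batch, seq_len], the backend create() functions in this commit fill the struct as sketched below (a hypothetical helper mirroring their assignments; dtype fields omitted):

#include <cstddef>

// Illustrative only: how the fields map onto a contiguous
// [batch, seq_len, vocab] logits tensor.
CrossEntropyInfo makeInfo(size_t batch, size_t seq_len, size_t vocab) {
    CrossEntropyInfo info{};
    info.outer_size = batch * seq_len;             // target_desc->numel()
    info.vocab_size = vocab;                       // x_desc->shape().back()
    info.x_stride = static_cast<ptrdiff_t>(vocab); // rows assumed contiguous
    return info;
}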
src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h (new file, mode 100644)

#ifndef __CROSS_ENTROPY_METAX_H__
#define __CROSS_ENTROPY_METAX_H__

#include "../cross_entropy.h"

DESCRIPTOR(metax)

#endif // __CROSS_ENTROPY_METAX_H__
src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca (new file, mode 100644)
#include "../../../devices/metax/metax_common.h"
#include "cross_entropy_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include <cmath>
namespace {
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
__device__ void crossEntropyKernel(
Tdata *y_,
const Tdata *x_,
const void *target_,
size_t outer_size,
size_t vocab_size,
ptrdiff_t x_stride) {
size_t row_idx = blockIdx.x;
if (row_idx >= outer_size) {
return;
}
const Tdata *x = x_ + row_idx * x_stride;
const Tidx *target = reinterpret_cast<const Tidx *>(target_);
Tidx label = target[row_idx];
Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
__shared__ Tcompute max_val_shared;
if (threadIdx.x == 0) {
max_val_shared = static_cast<Tcompute>(max_val_raw);
}
__syncthreads();
Tcompute max_val = max_val_shared;
Tcompute thread_sum = Tcompute(0);
for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
Tcompute val = static_cast<Tcompute>(x[col]);
thread_sum += expf(val - max_val);
}
using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
if (threadIdx.x == 0) {
if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
y_[row_idx] = static_cast<Tdata>(0.0f);
return;
}
Tcompute log_term = logf(block_sum) + max_val;
Tcompute target_logit = static_cast<Tcompute>(x[label]);
y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
}
}
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
INFINIOP_METAX_KERNEL crossEntropy(
Tdata *y, const Tdata *x, const void *target,
size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
y, x, target, outer_size, vocab_size, x_stride);
}
} // namespace
namespace op::cross_entropy::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc) {
(void)y_desc;
auto x_dtype = x_desc->dtype();
auto t_dtype = target_desc->dtype();
CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
CrossEntropyInfo info{};
info.dtype = x_dtype;
info.target_dtype = t_dtype;
info.vocab_size = x_desc->shape().back();
info.outer_size = target_desc->numel();
info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
info, 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
const CrossEntropyInfo &info, hcStream_t stream) {
dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
if (info.target_dtype == INFINI_DTYPE_I64) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, cuda_bfloat16, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (info.target_dtype == INFINI_DTYPE_I32) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, cuda_bfloat16, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream_) const {
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto stream = reinterpret_cast<hcStream_t>(stream_);
int max_threads = _opaque->internal->maxThreadsPerBlock();
if (max_threads >= METAX_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(y, x, target, _info, stream));
} else if (max_threads >= METAX_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_512>(y, x, target, _info, stream));
} else {
CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::cross_entropy::metax
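BLOCK_SIZE is a template parameter rather than a runtime value because cub::BlockReduce<Tcompute, BLOCK_SIZE> bakes the block size into its TempStorage type, so the runtime dispatch in calculate() must choose among separate instantiations. A condensed standalone sketch of that dispatch shape:

#include <cstdio>

// Condensed form of the runtime-to-compile-time dispatch used above: one
// instantiation exists per supported block size, and an if-chain picks one.
template <unsigned int BLOCK_SIZE>
void launchAt() {
    std::printf("would launch with BLOCK_SIZE=%u\n", BLOCK_SIZE);
}

void dispatch(int max_threads) {
    if (max_threads >= 1024) {
        launchAt<1024>();
    } else if (max_threads >= 512) {
        launchAt<512>();
    } else {
        launchAt<256>();
    }
}

int main() {
    dispatch(1024);
    return 0;
}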
src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h (new file, mode 100644)
#ifndef __CROSS_ENTROPY_KERNEL_CUH__
#define __CROSS_ENTROPY_KERNEL_CUH__

template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
__device__ void crossEntropyKernel(
    Tdata *y_,
    const Tdata *x_,
    const void *target_,
    size_t outer_size,
    size_t vocab_size,
    ptrdiff_t x_stride) {
    size_t row_idx = blockIdx.x;
    if (row_idx >= outer_size) {
        return;
    }
    const Tdata *x = x_ + row_idx * x_stride;
    const Tidx *target = reinterpret_cast<const Tidx *>(target_);
    Tidx label = target[row_idx];
    Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
    __shared__ Tcompute max_val_shared;
    if (threadIdx.x == 0) {
        max_val_shared = static_cast<Tcompute>(max_val_raw);
    }
    __syncthreads();
    Tcompute max_val = max_val_shared;
    Tcompute thread_sum = Tcompute(0);
    for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
        Tcompute val = static_cast<Tcompute>(x[col]);
        thread_sum += expf(val - max_val);
    }
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
    if (threadIdx.x == 0) {
        if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
            y_[row_idx] = static_cast<Tdata>(0.0f);
            return;
        }
        Tcompute log_term = logf(block_sum) + max_val;
        Tcompute target_logit = static_cast<Tcompute>(x[label]);
        y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
    }
}

#endif
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h (new file, mode 100644)

#ifndef __CROSS_ENTROPY_MOORE_H__
#define __CROSS_ENTROPY_MOORE_H__

#include "../cross_entropy.h"

DESCRIPTOR(moore)

#endif
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu (new file, mode 100644)
#include "../../../devices/moore/moore_common.h"
#include "cross_entropy_moore.h"
#include <cub/block/block_reduce.cuh>
#include "../../../devices/moore/moore_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
#include "cross_entropy_kernel.h"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
INFINIOP_MOORE_KERNEL crossEntropy(
Tdata *y, const Tdata *x, const void *target,
size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
y, x, target, outer_size, vocab_size, x_stride);
}
namespace op::cross_entropy::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc) {
(void)y_desc;
auto x_dtype = x_desc->dtype();
auto t_dtype = target_desc->dtype();
CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
CrossEntropyInfo info{};
info.dtype = x_dtype;
info.target_dtype = t_dtype;
info.vocab_size = x_desc->shape().back();
info.outer_size = target_desc->numel();
info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info, 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
const CrossEntropyInfo &info, musaStream_t stream) {
dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
if (info.target_dtype == INFINI_DTYPE_I64) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, __mt_bfloat16, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (info.target_dtype == INFINI_DTYPE_I32) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, __mt_bfloat16, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(y, x, target, _info, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(y, x, target, _info, stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::cross_entropy::moore
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu (new file, mode 100644)
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "cross_entropy_nvidia.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute = float>
INFINIOP_CUDA_KERNEL crossEntropy(
    Tdata *y,
    const Tdata *x,
    const void *target,
    size_t outer_size,
    size_t vocab_size,
    ptrdiff_t x_stride) {
    crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
        y, x, target, outer_size, vocab_size, x_stride);
}

namespace op::cross_entropy::nvidia {

struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();
    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.vocab_size = x_desc->shape().back();
    info.outer_size = target_desc->numel();
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
    auto internal = reinterpret_cast<device::nvidia::Handle *>(handle)->internal();
    *desc_ptr = new Descriptor(
        new Opaque{internal},
        info,
        0,
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    void *y,
    const void *x,
    const void *target,
    const CrossEntropyInfo &info,
    cudaStream_t stream) {
    dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
    if (info.target_dtype == INFINI_DTYPE_I64) {
        if (info.dtype == INFINI_DTYPE_F16) {
            crossEntropy<BLOCK_SIZE, half, int64_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (half *)y, (const half *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        } else if (info.dtype == INFINI_DTYPE_BF16) {
            crossEntropy<BLOCK_SIZE, __nv_bfloat16, int64_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        } else if (info.dtype == INFINI_DTYPE_F32) {
            crossEntropy<BLOCK_SIZE, float, int64_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (float *)y, (const float *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        }
    } else if (info.target_dtype == INFINI_DTYPE_I32) {
        if (info.dtype == INFINI_DTYPE_F16) {
            crossEntropy<BLOCK_SIZE, half, int32_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (half *)y, (const half *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        } else if (info.dtype == INFINI_DTYPE_BF16) {
            crossEntropy<BLOCK_SIZE, __nv_bfloat16, int32_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        } else if (info.dtype == INFINI_DTYPE_F32) {
            crossEntropy<BLOCK_SIZE, float, int32_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (float *)y, (const float *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        }
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *target,
    void *stream_) const {
    cudaStream_t stream = (cudaStream_t)stream_;
    int max_threads = _opaque->internal->maxThreadsPerBlock();
    if (max_threads >= 1024) {
        CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream));
    } else if (max_threads >= 512) {
        CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream));
    } else {
        CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::cross_entropy::nvidia
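All backends in this commit compute the same per-row quantity, written here for row i with target index t_i over vocabulary size V:

\[
\ell_i \;=\; \log \sum_{j=0}^{V-1} e^{x_{ij}} \;-\; x_{i,t_i}
\;=\; \Bigl( \log \sum_{j=0}^{V-1} e^{x_{ij} - m_i} \Bigr) + m_i - x_{i,t_i},
\qquad m_i = \max_j x_{ij}.
\]

Subtracting the row maximum m_i keeps every exponent at or below zero, so the sum cannot overflow; rows whose target index is out of range are written as 0 rather than propagating an invalid read.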