Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
21c6af2d
Unverified
Commit
21c6af2d
authored
Mar 11, 2026
by
thatPepe
Committed by
GitHub
Mar 11, 2026
Browse files
Merge pull request #1069 from InfiniTensor/issue/1031_T1_1_15
【算子比赛2025秋】T1-1-15
parents
99a802dd
5f329d7a
Changes
112
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1277 additions
and
0 deletions
+1277
-0
src/infiniop/ops/cdist/cpu/cdist_cpu.h
src/infiniop/ops/cdist/cpu/cdist_cpu.h
+11
-0
src/infiniop/ops/cdist/info.h
src/infiniop/ops/cdist/info.h
+113
-0
src/infiniop/ops/cdist/metax/cdist_metax.h
src/infiniop/ops/cdist/metax/cdist_metax.h
+16
-0
src/infiniop/ops/cdist/metax/cdist_metax.maca
src/infiniop/ops/cdist/metax/cdist_metax.maca
+161
-0
src/infiniop/ops/cdist/moore/cdist_moore.h
src/infiniop/ops/cdist/moore/cdist_moore.h
+16
-0
src/infiniop/ops/cdist/moore/cdist_moore.mu
src/infiniop/ops/cdist/moore/cdist_moore.mu
+145
-0
src/infiniop/ops/cdist/nvidia/cdist_nvidia.cu
src/infiniop/ops/cdist/nvidia/cdist_nvidia.cu
+161
-0
src/infiniop/ops/cdist/nvidia/cdist_nvidia.cuh
src/infiniop/ops/cdist/nvidia/cdist_nvidia.cuh
+16
-0
src/infiniop/ops/cdist/operator.cc
src/infiniop/ops/cdist/operator.cc
+229
-0
src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc
src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc
+54
-0
src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
+19
-0
src/infiniop/ops/reciprocal/cuda/kernel.cuh
src/infiniop/ops/reciprocal/cuda/kernel.cuh
+27
-0
src/infiniop/ops/reciprocal/metax/reciprocal_metax.h
src/infiniop/ops/reciprocal/metax/reciprocal_metax.h
+8
-0
src/infiniop/ops/reciprocal/metax/reciprocal_metax.maca
src/infiniop/ops/reciprocal/metax/reciprocal_metax.maca
+61
-0
src/infiniop/ops/reciprocal/metax/reciprocal_metax_kernel.h
src/infiniop/ops/reciprocal/metax/reciprocal_metax_kernel.h
+47
-0
src/infiniop/ops/reciprocal/moore/reciprocal_moore.h
src/infiniop/ops/reciprocal/moore/reciprocal_moore.h
+11
-0
src/infiniop/ops/reciprocal/moore/reciprocal_moore.mu
src/infiniop/ops/reciprocal/moore/reciprocal_moore.mu
+66
-0
src/infiniop/ops/reciprocal/moore/reciprocal_moore_kernel.h
src/infiniop/ops/reciprocal/moore/reciprocal_moore_kernel.h
+47
-0
src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
+61
-0
src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh
src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh
+8
-0
No files found.
src/infiniop/ops/cdist/cpu/cdist_cpu.h
0 → 100644
View file @
21c6af2d
#ifndef __CDIST_CPU_H__
#define __CDIST_CPU_H__

#include "../cdist.h"

// Uses the DESCRIPTOR macro defined in cdist.h.
// This generates the Descriptor class in namespace op::cdist::cpu;
// the class holds a CdistInfo and exposes create/calculate interfaces.
DESCRIPTOR(cpu)

#endif // __CDIST_CPU_H__
src/infiniop/ops/cdist/info.h
0 → 100644
View file @
21c6af2d
#ifndef __CDIST_INFO_H__
#define __CDIST_INFO_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include <algorithm>
namespace
op
::
cdist
{
/**
 * Borrows the BlasMatrix idea to describe the cdist input/output matrices:
 *   x1: (Batch, M, D)
 *   x2: (Batch, N, D)
 *   y:  (Batch, M, N)
 */
struct CdistMatrix {
    size_t ndim;
    size_t batch;
    ptrdiff_t stride;     // stride between consecutive batches
    size_t rows;          // M or N
    size_t cols;          // D (feature dim), or N in the output
    ptrdiff_t row_stride;
    ptrdiff_t col_stride;

    /// Builds a CdistMatrix view from a 2-D or 3-D tensor descriptor.
    /// Any other rank yields INFINI_STATUS_BAD_TENSOR_SHAPE.
    static utils::Result<CdistMatrix> create(infiniopTensorDescriptor_t layout) {
        CdistMatrix mat;
        const auto rank = layout->ndim();
        switch (rank) {
        case 2:
            // Un-batched: treat as a single batch with zero batch stride.
            mat.ndim = 2;
            mat.batch = 1;
            mat.stride = 0;
            mat.rows = layout->dim(0);
            mat.cols = layout->dim(1);
            mat.row_stride = layout->stride(0);
            mat.col_stride = layout->stride(1);
            break;
        case 3:
            mat.ndim = 3;
            mat.batch = layout->dim(0);
            // A batch of 1 broadcasts, so its batch stride is irrelevant.
            mat.stride = mat.batch == 1 ? 0 : layout->stride(0);
            mat.rows = layout->dim(1);
            mat.cols = layout->dim(2);
            mat.row_stride = layout->stride(1);
            mat.col_stride = layout->stride(2);
            break;
        default:
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }
        return utils::Result<CdistMatrix>(mat);
    }

    /// True when this operand's batch matches _batch, or is 1 (broadcastable).
    bool match_batch(size_t _batch) const {
        return batch == 1 || batch == _batch;
    }
};
/**
 * Validated metadata for one cdist computation:
 *   x1: (batch, m, d), x2: (batch, n, d) -> y: (batch, m, n)
 */
class CdistInfo {
    CdistInfo() = default;

public:
    CdistMatrix x1_matrix;
    CdistMatrix x2_matrix;
    CdistMatrix y_matrix;
    size_t m, n, d, batch;

    /// Parses the three descriptors and validates shape and batch
    /// compatibility; returns INFINI_STATUS_BAD_TENSOR_SHAPE on mismatch.
    static utils::Result<CdistInfo> create(
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x1_desc,
        infiniopTensorDescriptor_t x2_desc) {
        auto x1_result = CdistMatrix::create(x1_desc);
        CHECK_RESULT(x1_result);
        auto x2_result = CdistMatrix::create(x2_desc);
        CHECK_RESULT(x2_result);
        auto y_result = CdistMatrix::create(y_desc);
        CHECK_RESULT(y_result);

        auto x1 = x1_result.take();
        auto x2 = x2_result.take();
        auto y = y_result.take();

        // Shape checks: x1 (M, D) with x2 (N, D) must produce y (M, N) —
        // feature dims must agree and the output must be M x N.
        const bool feature_dims_match = (x1.cols == x2.cols);
        const bool output_shape_ok = (y.rows == x1.rows) && (y.cols == x2.rows);
        if (!feature_dims_match || !output_shape_ok) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }

        // Batch check: each input must equal the output batch or be
        // broadcastable (batch == 1).
        size_t batch_size = y.batch;
        if (!x1.match_batch(batch_size) || !x2.match_batch(batch_size)) {
            return INFINI_STATUS_BAD_TENSOR_SHAPE;
        }

        size_t m = x1.rows;
        size_t n = x2.rows;
        size_t d = x1.cols;
        return utils::Result<CdistInfo>(CdistInfo{x1, x2, y, m, n, d, batch_size});
    }
};
}
// namespace op::cdist
#endif // __CDIST_INFO_H__
src/infiniop/ops/cdist/metax/cdist_metax.h
0 → 100644
View file @
21c6af2d
#ifndef __CDIST_METAX_H__
#define __CDIST_METAX_H__

#include "../cdist.h"

/**
 * Uses the DESCRIPTOR macro defined in cdist.h.
 * Generates the Descriptor class for the METAX device in namespace
 * op::cdist::metax.
 *
 * In the METAX implementation the Opaque struct would typically store:
 * - a BLAS handle for p=2.0 matrix-multiply acceleration,
 * - the stream the task runs on,
 * - configuration parameters for custom kernels.
 */
DESCRIPTOR(metax)

// NOTE(review): the guard was renamed from __CDIST_METAX_CUH__ — this is a
// .h file, and the other backend headers (e.g. __CDIST_MOORE_H__) use _H_.
#endif // __CDIST_METAX_H__
src/infiniop/ops/cdist/metax/cdist_metax.maca
0 → 100644
View file @
21c6af2d
#include "../../../devices/metax/metax_handle.h"
#include "cdist_metax.h"
#include <iostream>
namespace op::cdist::metax {
// No device-side state is needed for this backend yet.
struct Descriptor::Opaque {};

Descriptor::~Descriptor() = default;

/**
 * Validates descriptors and builds a cdist Descriptor for the METAX backend.
 * Only F32 is supported at the moment (matching the tests); no workspace
 * is required by the current implementation.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x1_desc,
    infiniopTensorDescriptor_t x2_desc,
    double p) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);

    // The METAX backend currently supports F32 only.
    auto dtype = y_desc->dtype();
    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);

    auto info_result = CdistInfo::create(y_desc, x1_desc, x2_desc);
    CHECK_RESULT(info_result);

    // Workspace size is 0: the generic kernel needs no scratch memory.
    *desc_ptr = new Descriptor(
        dtype, info_result.take(), p, 0,
        nullptr,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// --- Kernel 1: L2 Epilogue ---
// Placeholder kept for a future GEMM-accelerated p=2 path; it is not
// launched by the current implementation.
template <typename T>
__global__ void cdist_l2_epilogue_kernel(T *y, const T *x1_norm, const T *x2_norm,
                                         int M, int N, int batch_stride_y) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int b = blockIdx.z;
    if (i < M && j < N) {
        int idx = b * batch_stride_y + i * N + j;
        // A GEMM is assumed to have already stored -2*x1*x2^T into y.
        float val = (float)x1_norm[b * M + i] + (float)x2_norm[b * N + j] + (float)y[idx];
        // Clamp at 0 before sqrt to guard against small negative rounding error.
        y[idx] = (T)sqrtf(fmaxf(val, 0.0f));
    }
}
// --- Kernel 2: Generic P-Norm (F32, supports arbitrary strides) ---
// One thread computes one output element y[b, i, j] = ||x1[b,i,:] - x2[b,j,:]||_p.
__global__ void cdist_generic_kernel_f32(
    float *y,
    const float *x1,
    const float *x2,
    size_t m,
    size_t n,
    size_t d,
    ptrdiff_t x1_stride,
    ptrdiff_t x1_row_stride,
    ptrdiff_t x1_col_stride,
    ptrdiff_t x2_stride,
    ptrdiff_t x2_row_stride,
    ptrdiff_t x2_col_stride,
    ptrdiff_t y_stride,
    ptrdiff_t y_row_stride,
    ptrdiff_t y_col_stride,
    double p) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int b = blockIdx.z;
    if (i >= (int)m || j >= (int)n) {
        return;
    }
    // Locate the output element y[b, i, j].
    float *y_ptr = y + b * y_stride + i * y_row_stride + j * y_col_stride;
    // Locate the input vectors x1[b, i, :] and x2[b, j, :].
    const float *x1_vec = x1 + b * x1_stride + i * x1_row_stride;
    const float *x2_vec = x2 + b * x2_stride + j * x2_row_stride;
    // Accumulator is declared double (note: several branches round through float).
    double dist = 0.0;
    for (size_t k = 0; k < d; ++k) {
        float v1 = *(x1_vec + k * x1_col_stride);
        float v2 = *(x2_vec + k * x2_col_stride);
        float diff = fabsf(v1 - v2);
        if (p == 1.0) {
            dist += diff;                        // L1: sum of |diff|
        } else if (p == 2.0) {
            dist += diff * diff;                 // L2: sum of squares
        } else if (isinf(p)) {
            dist = fmaxf((float)dist, diff);     // L-inf: running max
        } else {
            dist += powf((float)diff, (float)p); // general p-norm term
        }
    }
    // Apply the final root where the norm requires one (L1 and L-inf do not).
    if (p == 2.0) {
        dist = sqrtf((float)dist);
    } else if (!isinf(p) && p != 1.0) {
        dist = powf((float)dist, 1.0f / (float)p);
    }
    *y_ptr = (float)dist;
}
/**
 * Launches the generic strided p-norm kernel on the given MACA stream.
 * workspace is unused; only F32 tensors are accepted.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x1,
    const void *x2,
    void *stream) const {
    (void)workspace;
    (void)workspace_size;

    if (_dtype != INFINI_DTYPE_F32) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    auto maca_stream = (mcStream_t)stream;

    // One thread per output element; the z dimension walks the batch.
    constexpr unsigned int kTile = 16;
    dim3 block(kTile, kTile);
    dim3 grid(
        static_cast<unsigned int>((_info.n + kTile - 1) / kTile),
        static_cast<unsigned int>((_info.m + kTile - 1) / kTile),
        static_cast<unsigned int>(_info.batch));

    cdist_generic_kernel_f32<<<grid, block, 0, maca_stream>>>(
        static_cast<float *>(y),
        static_cast<const float *>(x1),
        static_cast<const float *>(x2),
        _info.m, _info.n, _info.d,
        _info.x1_matrix.stride, _info.x1_matrix.row_stride, _info.x1_matrix.col_stride,
        _info.x2_matrix.stride, _info.x2_matrix.row_stride, _info.x2_matrix.col_stride,
        _info.y_matrix.stride, _info.y_matrix.row_stride, _info.y_matrix.col_stride,
        _p);

    if (mcGetLastError() != mcSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::cdist::metax
src/infiniop/ops/cdist/moore/cdist_moore.h
0 → 100644
View file @
21c6af2d
#ifndef __CDIST_MOORE_H__
#define __CDIST_MOORE_H__

#include "../cdist.h"

/**
 * Uses the DESCRIPTOR macro defined in cdist.h.
 * Generates the Descriptor class for the Moore device in namespace
 * op::cdist::moore.
 *
 * In the Moore implementation the Opaque struct would typically store:
 * - mublasHandle_t: for p=2.0 matrix-multiply acceleration (cuBLAS analogue),
 * - musaStream_t: the stream the task runs on,
 * - configuration parameters for custom kernels.
 */
DESCRIPTOR(moore)

#endif // __CDIST_MOORE_H__
src/infiniop/ops/cdist/moore/cdist_moore.mu
0 → 100644
View file @
21c6af2d
#include "../../../devices/moore/moore_handle.h"
#include "cdist_moore.h"
#include <iostream>
#include <musa_runtime.h>
namespace op::cdist::moore {
// No device-side state is needed for this backend yet.
struct Descriptor::Opaque {};

Descriptor::~Descriptor() = default;

/**
 * Validates descriptors and builds a cdist Descriptor for the Moore backend.
 * Only F32 is supported at the moment; no workspace is required.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x1_desc,
    infiniopTensorDescriptor_t x2_desc,
    double p) {
    // Cast the generic handle to the Moore handle.
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);

    // Same as the reference implementation: F32 only for now.
    auto dtype = y_desc->dtype();
    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);

    auto info_result = CdistInfo::create(y_desc, x1_desc, x2_desc);
    CHECK_RESULT(info_result);

    *desc_ptr = new Descriptor(
        dtype, info_result.take(), p, 0,
        nullptr,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// --- Kernel: Generic P-Norm (MUSA F32 implementation) ---
// One thread computes one output element y[b, i, j] = ||x1[b,i,:] - x2[b,j,:]||_p.
__global__ void cdist_generic_kernel_f32(
    float *y,
    const float *x1,
    const float *x2,
    size_t m,
    size_t n,
    size_t d,
    ptrdiff_t x1_stride,
    ptrdiff_t x1_row_stride,
    ptrdiff_t x1_col_stride,
    ptrdiff_t x2_stride,
    ptrdiff_t x2_row_stride,
    ptrdiff_t x2_col_stride,
    ptrdiff_t y_stride,
    ptrdiff_t y_row_stride,
    ptrdiff_t y_col_stride,
    double p) {
    // MUSA supports 3-D thread indexing just like CUDA.
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int b = blockIdx.z;
    if (i >= (int)m || j >= (int)n) {
        return;
    }
    // Locate the output element y[b, i, j].
    float *y_ptr = y + b * y_stride + i * y_row_stride + j * y_col_stride;
    // Locate the input vectors x1[b, i, :] and x2[b, j, :].
    const float *x1_vec = x1 + b * x1_stride + i * x1_row_stride;
    const float *x2_vec = x2 + b * x2_stride + j * x2_row_stride;
    // Accumulator is declared double (note: some branches round through float).
    double dist = 0.0;
    for (size_t k = 0; k < d; ++k) {
        float v1 = *(x1_vec + k * x1_col_stride);
        float v2 = *(x2_vec + k * x2_col_stride);
        float diff = fabsf(v1 - v2);
        if (p == 1.0) {
            dist += (double)diff;              // L1: sum of |diff|
        } else if (p == 2.0) {
            dist += (double)diff * diff;       // L2: sum of squares
        } else if (isinf(p)) {
            dist = fmaxf((float)dist, diff);   // L-inf: running max
        } else {
            dist += powf(diff, (float)p);      // general p-norm term
        }
    }
    // Apply the final root where the norm requires one (L1 and L-inf do not).
    if (p == 2.0) {
        dist = sqrtf((float)dist);
    } else if (!isinf(p) && p != 1.0) {
        dist = powf((float)dist, 1.0f / (float)p);
    }
    *y_ptr = (float)dist;
}
/**
 * Launches the generic strided p-norm kernel on the given MUSA stream.
 * workspace is unused; only F32 tensors are accepted.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x1,
    const void *x2,
    void *stream) const {
    (void)workspace;
    (void)workspace_size;
    if (_dtype != INFINI_DTYPE_F32) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Switch to the MUSA stream type.
    musaStream_t mustream = reinterpret_cast<musaStream_t>(stream);
    // 16x16 blocks are a generic choice that also works on the MUSA architecture.
    dim3 block(16, 16);
    dim3 grid(
        static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
        static_cast<unsigned int>((_info.m + block.y - 1) / block.y),
        static_cast<unsigned int>(_info.batch));
    cdist_generic_kernel_f32<<<grid, block, 0, mustream>>>(
        static_cast<float *>(y),
        static_cast<const float *>(x1),
        static_cast<const float *>(x2),
        _info.m,
        _info.n,
        _info.d,
        _info.x1_matrix.stride,
        _info.x1_matrix.row_stride,
        _info.x1_matrix.col_stride,
        _info.x2_matrix.stride,
        _info.x2_matrix.row_stride,
        _info.x2_matrix.col_stride,
        _info.y_matrix.stride,
        _info.y_matrix.row_stride,
        _info.y_matrix.col_stride,
        _p);
    // Surface kernel-launch failures, consistent with the NVIDIA and METAX
    // backends (the original returned success unconditionally).
    if (musaGetLastError() != musaSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::cdist::moore
src/infiniop/ops/cdist/nvidia/cdist_nvidia.cu
0 → 100644
View file @
21c6af2d
#include "../../../devices/nvidia/nvidia_handle.cuh"
#include "cdist_nvidia.cuh"
#include <iostream>
namespace
op
::
cdist
::
nvidia
{
// No device-side state is needed for this backend yet.
struct Descriptor::Opaque {};

Descriptor::~Descriptor() = default;

/**
 * Validates descriptors and builds a cdist Descriptor for the NVIDIA backend.
 * Only F32 is supported at the moment (matching the tests); no workspace
 * is required by the current implementation.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x1_desc,
    infiniopTensorDescriptor_t x2_desc,
    double p) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);

    // The NVIDIA backend currently supports F32 only.
    auto dtype = y_desc->dtype();
    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);

    auto info_result = CdistInfo::create(y_desc, x1_desc, x2_desc);
    CHECK_RESULT(info_result);

    // Workspace size is 0: the generic kernel needs no scratch memory.
    *desc_ptr = new Descriptor(
        dtype, info_result.take(), p, 0,
        nullptr,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// --- Kernel 1: L2 Epilogue ---
// Placeholder kept for a future GEMM-accelerated p=2 path; it is not
// launched by the current implementation.
template <typename T>
__global__ void cdist_l2_epilogue_kernel(T *y, const T *x1_norm, const T *x2_norm,
                                         int M, int N, int batch_stride_y) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int b = blockIdx.z;
    if (i < M && j < N) {
        int idx = b * batch_stride_y + i * N + j;
        // A GEMM is assumed to have already stored -2*x1*x2^T into y.
        float val = (float)x1_norm[b * M + i] + (float)x2_norm[b * N + j] + (float)y[idx];
        // Clamp at 0 before sqrt to guard against small negative rounding error.
        y[idx] = (T)sqrtf(fmaxf(val, 0.0f));
    }
}
// --- Kernel 2: Generic P-Norm (F32, supports arbitrary strides) ---
// One thread computes one output element y[b, i, j] = ||x1[b,i,:] - x2[b,j,:]||_p.
__global__ void cdist_generic_kernel_f32(
    float *y,
    const float *x1,
    const float *x2,
    size_t m,
    size_t n,
    size_t d,
    ptrdiff_t x1_stride,
    ptrdiff_t x1_row_stride,
    ptrdiff_t x1_col_stride,
    ptrdiff_t x2_stride,
    ptrdiff_t x2_row_stride,
    ptrdiff_t x2_col_stride,
    ptrdiff_t y_stride,
    ptrdiff_t y_row_stride,
    ptrdiff_t y_col_stride,
    double p) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int b = blockIdx.z;
    if (i >= (int)m || j >= (int)n) {
        return;
    }
    // Locate the output element y[b, i, j].
    float *y_ptr = y + b * y_stride + i * y_row_stride + j * y_col_stride;
    // Locate the input vectors x1[b, i, :] and x2[b, j, :].
    const float *x1_vec = x1 + b * x1_stride + i * x1_row_stride;
    const float *x2_vec = x2 + b * x2_stride + j * x2_row_stride;
    // Accumulator is declared double (note: several branches round through float).
    double dist = 0.0;
    for (size_t k = 0; k < d; ++k) {
        float v1 = *(x1_vec + k * x1_col_stride);
        float v2 = *(x2_vec + k * x2_col_stride);
        float diff = fabsf(v1 - v2);
        if (p == 1.0) {
            dist += diff;                        // L1: sum of |diff|
        } else if (p == 2.0) {
            dist += diff * diff;                 // L2: sum of squares
        } else if (isinf(p)) {
            dist = fmaxf((float)dist, diff);     // L-inf: running max
        } else {
            dist += powf((float)diff, (float)p); // general p-norm term
        }
    }
    // Apply the final root where the norm requires one (L1 and L-inf do not).
    if (p == 2.0) {
        dist = sqrtf((float)dist);
    } else if (!isinf(p) && p != 1.0) {
        dist = powf((float)dist, 1.0f / (float)p);
    }
    *y_ptr = (float)dist;
}
/**
 * Launches the generic strided p-norm kernel on the given CUDA stream.
 * workspace is unused; only F32 tensors are accepted.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x1,
    const void *x2,
    void *stream) const {
    (void)workspace;
    (void)workspace_size;

    if (_dtype != INFINI_DTYPE_F32) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    auto cuda_stream = (cudaStream_t)stream;

    // One thread per output element; the z dimension walks the batch.
    constexpr unsigned int kTile = 16;
    dim3 block(kTile, kTile);
    dim3 grid(
        static_cast<unsigned int>((_info.n + kTile - 1) / kTile),
        static_cast<unsigned int>((_info.m + kTile - 1) / kTile),
        static_cast<unsigned int>(_info.batch));

    cdist_generic_kernel_f32<<<grid, block, 0, cuda_stream>>>(
        static_cast<float *>(y),
        static_cast<const float *>(x1),
        static_cast<const float *>(x2),
        _info.m, _info.n, _info.d,
        _info.x1_matrix.stride, _info.x1_matrix.row_stride, _info.x1_matrix.col_stride,
        _info.x2_matrix.stride, _info.x2_matrix.row_stride, _info.x2_matrix.col_stride,
        _info.y_matrix.stride, _info.y_matrix.row_stride, _info.y_matrix.col_stride,
        _p);

    if (cudaGetLastError() != cudaSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}
}
// namespace op::cdist::nvidia
src/infiniop/ops/cdist/nvidia/cdist_nvidia.cuh
0 → 100644
View file @
21c6af2d
#ifndef __CDIST_NVIDIA_CUH__
#define __CDIST_NVIDIA_CUH__

#include "../cdist.h"

/**
 * Uses the DESCRIPTOR macro defined in cdist.h.
 * Generates the Descriptor class for the NVIDIA device in namespace
 * op::cdist::nvidia.
 *
 * In the NVIDIA implementation the Opaque struct would typically store:
 * - cublasHandle_t: for p=2.0 matrix-multiply acceleration,
 * - cudaStream_t: the stream the task runs on,
 * - configuration parameters for custom kernels.
 */
DESCRIPTOR(nvidia)

#endif // __CDIST_NVIDIA_CUH__
src/infiniop/ops/cdist/operator.cc
0 → 100644
View file @
21c6af2d
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/cdist.h"
// 引入各硬件后端的 Descriptor 定义
#ifdef ENABLE_CPU_API
#include "cpu/cdist_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/cdist_nvidia.cuh"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/cdist_bang.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/cdist_ascend.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/cdist_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/cdist_moore.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/cdist_kunlun.h"
#endif
// -----------------------------------------------------------------------------
// 1. Create descriptor
// -----------------------------------------------------------------------------
// C ABI entry point: builds a backend-specific cdist descriptor for the
// device baked into the handle. p is the order of the p-norm distance.
__INFINI_C infiniStatus_t infiniopCreateCdistDescriptor(
    infiniopHandle_t handle,
    infiniopCdistDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x1_desc,
    infiniopTensorDescriptor_t x2_desc,
    double p) {

// One switch case forwarding to the backend's Descriptor::create.
#define CREATE(CASE, NAMESPACE)                                                                                           \
    case CASE:                                                                                                            \
        return op::cdist::NAMESPACE::Descriptor::create(handle,                                                           \
                                                        reinterpret_cast<op::cdist::NAMESPACE::Descriptor **>(desc_ptr), \
                                                        y_desc, x1_desc, x2_desc, p)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar, QY and Hygon reuse the CUDA-compatible nvidia backend.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// -----------------------------------------------------------------------------
// 2. Query workspace size
// -----------------------------------------------------------------------------
// Writes the workspace size required by the backend descriptor into *size.
__INFINI_C infiniStatus_t infiniopGetCdistWorkspaceSize(
    infiniopCdistDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                       \
    case CASE:                                                                                     \
        *size = reinterpret_cast<const op::cdist::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        GET(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// -----------------------------------------------------------------------------
// 3. Execute (compute pairwise distances)
// -----------------------------------------------------------------------------
// Runs the cdist computation described by desc on the given stream.
__INFINI_C infiniStatus_t infiniopCdist(
    infiniopCdistDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x1,
    const void *x2,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                              \
    case CASE:                                                                  \
        return reinterpret_cast<const op::cdist::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, x1, x2, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// -----------------------------------------------------------------------------
// 4. Destroy descriptor
// -----------------------------------------------------------------------------
// Frees the backend-specific descriptor created by infiniopCreateCdistDescriptor.
__INFINI_C infiniStatus_t infiniopDestroyCdistDescriptor(
    infiniopCdistDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                       \
        delete reinterpret_cast<const op::cdist::NAMESPACE::Descriptor *>(desc);     \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc
0 → 100644
View file @
21c6af2d
#include "reciprocal_cpu.h"
namespace
op
::
reciprocal
::
cpu
{
Descriptor::~Descriptor() = default;

/**
 * Validates dtype/shape and builds the CPU elementwise descriptor for
 * reciprocal. Expects exactly one input descriptor in input_desc_vec.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    // Reciprocal typically only supports floating point types.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // Build the CPU elementwise descriptor.
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}
/**
 * Dispatches the CPU elementwise reciprocal computation by dtype.
 * workspace/workspace_size are unused on the CPU backend.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    (void)workspace;      // the CPU path needs no scratch memory
    (void)workspace_size;
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<ReciprocalOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<ReciprocalOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<ReciprocalOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<ReciprocalOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // The unreachable trailing `return INFINI_STATUS_SUCCESS;` was removed:
    // every switch path already returns.
}
}
// namespace op::reciprocal::cpu
src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
0 → 100644
View file @
21c6af2d
#ifndef __RECIPROCAL_CPU_H__
#define __RECIPROCAL_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR
(
reciprocal
,
cpu
)
namespace
op
::
reciprocal
::
cpu
{
// Elementwise reciprocal functor: maps x to 1/x.
typedef struct ReciprocalOp {
public:
    // Unary operator: exactly one input tensor.
    static constexpr size_t num_inputs = 1;

    /// Returns the multiplicative inverse of x.
    template <typename T>
    T operator()(const T &x) const {
        const T one = static_cast<T>(1);
        return one / x;
    }
} ReciprocalOp;
}
// namespace op::reciprocal::cpu
#endif // __RECIPROCAL_CPU_H__
src/infiniop/ops/reciprocal/cuda/kernel.cuh
0 → 100644
View file @
21c6af2d
#ifndef __RECIPROCAL_CUDA_H__
#define __RECIPROCAL_CUDA_H__
namespace
op
::
reciprocal
::
cuda
{
// Elementwise reciprocal functor for the CUDA backend: y = 1 / x.
typedef struct ReciprocalOp {
public:
    // Unary operator: exactly one input tensor.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed half2 reciprocal intrinsic.
            return h2rcp(x);
        } else if constexpr (std::is_same_v<T, half>) {
            return hrcp(x);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // bfloat16 does not have a direct hrcp intrinsic in some versions,
            // so compute in float and convert back.
            return __float2bfloat16(1.0f / __bfloat162float(x));
        } else if constexpr (std::is_same_v<T, float>) {
            // Round-to-nearest reciprocal; matches IEEE 1.0f / x.
            // (Was __frcp_rd, which rounds toward -inf and disagrees with the
            // rounding used by every other dtype branch.)
            return __frcp_rn(x);
        } else {
            return static_cast<T>(1) / x;
        }
    }
} ReciprocalOp;
}
// namespace op::reciprocal::cuda
#endif // __RECIPROCAL_CUDA_H__
src/infiniop/ops/reciprocal/metax/reciprocal_metax.h
0 → 100644
View file @
21c6af2d
#ifndef __RECIPROCAL_METAX_API_H__
#define __RECIPROCAL_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Generates the op::reciprocal::metax::Descriptor declaration with the
// standard elementwise create/calculate interface.
ELEMENTWISE_DESCRIPTOR(reciprocal, metax)

#endif // __RECIPROCAL_METAX_API_H__
src/infiniop/ops/reciprocal/metax/reciprocal_metax.maca
0 → 100644
View file @
21c6af2d
#include "../../../elementwise/metax/elementwise_metax.h"
#include "reciprocal_metax.h"
#include "reciprocal_metax_kernel.h"
namespace op::reciprocal::metax {
Descriptor::~Descriptor() = default;

/**
 * Validates dtype/shape and builds the METAX elementwise descriptor for
 * reciprocal. Expects exactly one input descriptor in input_desc_vec.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    // Reciprocal typically only supports floating point types.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // Build the METAX elementwise descriptor.
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}
/**
 * Dispatches the METAX elementwise reciprocal kernel by dtype.
 * Fails with INSUFFICIENT_WORKSPACE if the caller's workspace is too small.
 */
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, metax::ReciprocalOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, metax::ReciprocalOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, metax::ReciprocalOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, metax::ReciprocalOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // The unreachable trailing `return INFINI_STATUS_SUCCESS;` was removed:
    // every switch path already returns.
}
} // namespace op::reciprocal::metax
src/infiniop/ops/reciprocal/metax/reciprocal_metax_kernel.h
0 → 100644
View file @
21c6af2d
#ifndef __RECIPROCAL_METAX_KERNEL_H__
#define __RECIPROCAL_METAX_KERNEL_H__
/*
 * This file contains the Reciprocal operation implementation for the METAX
 * (MACA) backend.
 *
 * It follows the consistent code structure to ensure alignment across the
 * different hardware backends supported by this project.
 */
namespace
op
::
reciprocal
::
metax
{
// Elementwise reciprocal functor: maps a to 1/a, with per-type handling.
typedef struct ReciprocalOp {
public:
    // Unary operator: exactly one input tensor.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &a) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Compute through float2 instead of relying on a packed
            // reciprocal instruction.
            float2 f2 = __half22float2(a);
            f2.x = 1.0f / f2.x;
            f2.y = 1.0f / f2.y;
            return __float22half2_rn(f2);
        } else if constexpr (std::is_same_v<T, half>) {
            // Promote to float for numerical stability, then round back.
            return __float2half(1.0f / __half2float(a));
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // BF16 is computed through float as well.
            float a_f = __bfloat162float(a);
            return __float2bfloat16_rn(1.0f / a_f);
        } else if constexpr (std::is_same_v<T, float>) {
            // Plain division; the compiler may lower this to a hardware
            // reciprocal (round-to-nearest).
            return 1.0f / a;
        } else if constexpr (std::is_same_v<T, double>) {
            return 1.0 / a;
        } else {
            // Integer types: standard C++ integer division semantics.
            return static_cast<T>(1) / a;
        }
    }
} ReciprocalOp;
}
// namespace op::reciprocal::metax
#endif // __RECIPROCAL_METAX_KERNEL_H__
src/infiniop/ops/reciprocal/moore/reciprocal_moore.h
0 → 100644
View file @
21c6af2d
#ifndef __RECIPROCAL_MOORE_API_H__
#define __RECIPROCAL_MOORE_API_H__

// 1. Pull in the Moore-platform elementwise API definitions.
#include "../../../elementwise/moore/elementwise_moore_api.h"

// 2. Generate op::reciprocal::moore::Descriptor.
//    The macro expands to the standard create/calculate declarations.
ELEMENTWISE_DESCRIPTOR(reciprocal, moore)

#endif // __RECIPROCAL_MOORE_API_H__
src/infiniop/ops/reciprocal/moore/reciprocal_moore.mu
0 → 100644
View file @
21c6af2d
#include "../../../elementwise/moore/elementwise_moore.h"
#include "reciprocal_moore.h"
#include "reciprocal_moore_kernel.h"
namespace op::reciprocal::moore {
Descriptor::~Descriptor() = default;

/**
 * Validates dtype/shape and builds the Moore (MUSA) elementwise descriptor
 * for reciprocal. Expects exactly one input descriptor in input_desc_vec.
 */
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    // Resolve the Moore (MUSA) handle.
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    // Reciprocal runs on floating-point element types only.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // The Moore elementwise macro initializes the operator metadata for MUSA.
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}
/// Runs the reciprocal kernel on the Moore (MUSA) backend.
/// @param workspace      scratch buffer supplied by the caller
/// @param workspace_size size of `workspace` in bytes
/// @param output         destination tensor data
/// @param inputs         source tensor data (exactly one is used)
/// @param stream         backend stream to launch on
/// @return INFINI_STATUS_SUCCESS, INFINI_STATUS_INSUFFICIENT_WORKSPACE, or
///         INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported dtypes
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // Reject scratch space smaller than what create() computed.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Dispatch on the output dtype; moore::ReciprocalOp supplies the
    // per-element math, 256 is the launch-configuration template parameter
    // of the elementwise backend.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, moore::ReciprocalOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        // Uses the bfloat16 type alias provided by the Moore environment.
        return _device_info->calculate<256, moore::ReciprocalOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, moore::ReciprocalOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, moore::ReciprocalOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Fix: removed an unreachable `return INFINI_STATUS_SUCCESS;` that
    // followed the switch — every case, including default, already returns.
}
} // namespace op::reciprocal::moore
src/infiniop/ops/reciprocal/moore/reciprocal_moore_kernel.h
0 → 100644
View file @
21c6af2d
#ifndef __RECIPROCAL_MOORE_KERNEL_H__
#define __RECIPROCAL_MOORE_KERNEL_H__
/*
* This file contains the Reciprocal operation implementation for the MUSA backend.
*
* It follows the consistent code structure to ensure alignment across different
* hardware platforms within the Moore Threads (MUSA) ecosystem.
*/
namespace op::reciprocal::moore {

// Element-wise reciprocal (1/x) functor for the Moore Threads (MUSA)
// elementwise backend.
// Idiom fix: plain `struct` replaces the redundant C-style
// `typedef struct ... } ReciprocalOp;`, and the redundant `public:` is
// dropped (struct members are public by default).
struct ReciprocalOp {
    // Unary operator: exactly one input tensor.
    static constexpr size_t num_inputs = 1;

    // Computes 1/a, with per-type handling selected at compile time.
    template <typename T>
    __device__ __forceinline__ T operator()(const T &a) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed half pair: widen to float2, invert each lane, then
            // round back to half2 (round-to-nearest-even).
            float2 f2 = __half22float2(a);
            f2.x = 1.0f / f2.x;
            f2.y = 1.0f / f2.y;
            return __float22half2_rn(f2);
        } else if constexpr (std::is_same_v<T, half>) {
            // Promote to float so the division keeps full precision.
            return __float2half(1.0f / __half2float(a));
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // BF16: compute in float, round the result back to nearest.
            float a_f = __bfloat162float(a);
            return __float2bfloat16_rn(1.0f / a_f);
        } else if constexpr (std::is_same_v<T, float>) {
            // Compilers typically lower 1.0f/a to a hardware reciprocal.
            return 1.0f / a;
        } else if constexpr (std::is_same_v<T, double>) {
            return 1.0 / a;
        } else {
            // Remaining arithmetic types: standard C++ division semantics
            // (integer reciprocal truncates to 0 except for |a| == 1).
            return static_cast<T>(1) / a;
        }
    }
};

} // namespace op::reciprocal::moore
#endif // __RECIPROCAL_MOORE_KERNEL_H__
src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
0 → 100644
View file @
21c6af2d
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "reciprocal_nvidia.cuh"
namespace
op
::
reciprocal
::
nvidia
{
// Defaulted destructor: member destructors perform all cleanup.
Descriptor::~Descriptor() = default;
/// Builds a reciprocal Descriptor for the NVIDIA (CUDA) backend.
/// Validates dtype and output/input shape agreement, then delegates
/// construction to the shared elementwise descriptor machinery.
/// @param handle_        opaque device handle (must be an NVIDIA handle)
/// @param desc_ptr       receives the newly created descriptor
/// @param out_desc       output tensor descriptor
/// @param input_desc_vec input tensor descriptors (exactly one is read)
/// @return INFINI_STATUS_SUCCESS, or an error status from the CHECK_* macros
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    // Interpret the opaque handle as an NVIDIA device handle.
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &y_desc = out_desc;
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = y_desc->shape();
    const auto &x_shape = x_desc->shape();
    // Reciprocal typically only supports floating point types
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(y_shape, x_shape);
    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
/// Runs the reciprocal kernel on the NVIDIA (CUDA) backend.
/// @param workspace      scratch buffer supplied by the caller
/// @param workspace_size size of `workspace` in bytes
/// @param output         destination tensor data
/// @param inputs         source tensor data (exactly one is used)
/// @param stream         backend stream to launch on
/// @return INFINI_STATUS_SUCCESS, INFINI_STATUS_INSUFFICIENT_WORKSPACE, or
///         INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported dtypes
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // Reject scratch space smaller than what create() computed.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Dispatch on the output dtype; cuda::ReciprocalOp supplies the
    // per-element math, 256 is the launch-configuration template parameter
    // of the elementwise backend.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::ReciprocalOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::ReciprocalOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Fix: removed an unreachable `return INFINI_STATUS_SUCCESS;` that
    // followed the switch — every case, including default, already returns.
}
}
// namespace op::reciprocal::nvidia
src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh
0 → 100644
View file @
21c6af2d
#ifndef __RECIPROCAL_CUDA_API_H__
#define __RECIPROCAL_CUDA_API_H__

// Public descriptor API for the reciprocal operator on the NVIDIA (CUDA)
// platform.
// NOTE(review): guard is named __RECIPROCAL_CUDA_API_H__ while sibling
// platforms use the platform name (e.g. __RECIPROCAL_MOORE_API_H__);
// consider renaming for consistency.

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

// Generates op::reciprocal::nvidia::Descriptor with the standard
// create()/calculate() declarations.
ELEMENTWISE_DESCRIPTOR(reciprocal, nvidia)

#endif // __RECIPROCAL_CUDA_API_H__
Prev
1
2
3
4
5
6
Next
yanzy
@yanzy
mentioned in commit
18773b69
·
Apr 21, 2026
mentioned in commit
18773b69
mentioned in commit 18773b69ae7bd79b4e9cf9ac0a4e4c6ed1bf9bf8
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment