Unverified Commit 21c6af2d authored by thatPepe's avatar thatPepe Committed by GitHub
Browse files

Merge pull request #1069 from InfiniTensor/issue/1031_T1_1_15

【算子比赛2025秋】T1-1-15
parents 99a802dd 5f329d7a
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::binary_cross_entropy_with_logits {
// Holds the tensors and scalar attributes of one BCE-with-logits test case.
struct Test::Attributes {
std::shared_ptr<Tensor> logits;
std::shared_ptr<Tensor> target;
std::shared_ptr<Tensor> weight; // optional; nullptr when absent from the test file
std::shared_ptr<Tensor> pos_weight; // optional; nullptr when absent from the test file
std::shared_ptr<Tensor> out;
std::shared_ptr<Tensor> ans;
int reduction; // 0: none, 1: mean, 2: sum
};
/// Builds a BCE-with-logits test case from raw attribute bytes and tensors.
/// Throws std::runtime_error when mandatory tensors are missing or the
/// `reduction` attribute payload is malformed.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    // 1. Validate that all mandatory tensors are present.
    if (tensors.find("logits") == tensors.end() || tensors.find("target") == tensors.end() || tensors.find("out") == tensors.end() || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid BCE Test: Missing mandatory tensors");
    }
    // 2. Read the `reduction` attribute (defaults to 1: mean).
    //    Validate the payload size before reinterpreting the raw bytes —
    //    the previous unchecked cast read out of bounds on a short payload.
    test->_attributes->reduction = 1;
    auto red_it = attributes.find("reduction");
    if (red_it != attributes.end()) {
        if (red_it->second.size() < sizeof(int)) {
            throw std::runtime_error("Invalid BCE Test: malformed reduction attribute");
        }
        test->_attributes->reduction = *reinterpret_cast<const int *>(red_it->second.data());
    }
    // 3. Fill in tensors; optional ones fall back to nullptr when absent.
    test->_attributes->logits = tensors["logits"];
    test->_attributes->target = tensors["target"];
    test->_attributes->out = tensors["out"];
    test->_attributes->ans = tensors["ans"];
    test->_attributes->weight = tensors.count("weight") ? tensors["weight"] : nullptr;
    test->_attributes->pos_weight = tensors.count("pos_weight") ? tensors["pos_weight"] : nullptr;
    return test;
}
/// Runs the BCE-with-logits operator on the given device, verifies the result
/// against the reference tensor, and benchmarks it.
/// Fix over the original: the workspace and the descriptor are now released
/// on every exit path (execution failure, wrong result, success); previously
/// all failure paths after creation leaked them.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopBCEWithLogitsDescriptor_t op_desc;
    // Move operands onto the target device.
    auto logits = _attributes->logits->to(device, device_id);
    auto target = _attributes->target->to(device, device_id);
    auto out = _attributes->out->to(device, device_id);
    // Optional tensors may be absent (nullptr).
    std::shared_ptr<Tensor> weight = (_attributes->weight) ? _attributes->weight->to(device, device_id) : nullptr;
    std::shared_ptr<Tensor> pos_weight = (_attributes->pos_weight) ? _attributes->pos_weight->to(device, device_id) : nullptr;
    // Create the descriptor; optional operands pass a null descriptor.
    auto w_desc = weight ? weight->desc() : nullptr;
    auto pw_desc = pos_weight ? pos_weight->desc() : nullptr;
    CHECK_OR(infiniopCreateBCEWithLogitsDescriptor(handle, &op_desc,
                                                   out->desc(),
                                                   logits->desc(),
                                                   target->desc(),
                                                   w_desc,
                                                   pw_desc,
                                                   static_cast<infiniopReduction_t>(_attributes->reduction)),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create BCE descriptor."));
    // Workspace management: from here on the descriptor must be destroyed on failure.
    size_t workspace_size;
    CHECK_OR(infiniopGetBCEWithLogitsWorkspaceSize(op_desc, &workspace_size),
             { infiniopDestroyBCEWithLogitsDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."); });
    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             { infiniopDestroyBCEWithLogitsDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."); });
    // Both resources must now be released on every exit path.
    auto cleanup = [&]() {
        infinirtFree(workspace);
        infiniopDestroyBCEWithLogitsDescriptor(op_desc);
    };
    // Execute once for correctness checking.
    auto w_data = weight ? weight->data() : nullptr;
    auto pw_data = pos_weight ? pos_weight->data() : nullptr;
    CHECK_OR(infiniopBCEWithLogits(op_desc, workspace, workspace_size,
                                   out->data(),
                                   logits->data(),
                                   target->data(),
                                   w_data,
                                   pw_data,
                                   nullptr),
             { cleanup(); return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."); });
    // Verify against the reference answer.
    try {
        allClose(out, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        cleanup();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    // Benchmark.
    double elapsed_time = benchmark(
        [=]() {
            infiniopBCEWithLogits(op_desc, workspace, workspace_size,
                                  out->data(), logits->data(), target->data(),
                                  w_data, pw_data, nullptr);
        },
        warm_ups, iterations);
    cleanup();
    return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
    // The only scalar attribute carried by this test case.
    return std::vector<std::string>{"reduction"};
}
std::vector<std::string> Test::tensor_names() {
    // All tensor slots this test understands, optional ones included.
    std::vector<std::string> names{"logits", "target", "weight", "pos_weight", "out", "ans"};
    return names;
}
std::vector<std::string> Test::output_names() {
    // Only one output tensor is produced.
    return std::vector<std::string>{"out"};
}
/// Human-readable summary of the test configuration.
/// Fix: the original omitted `target` and, when present, `pos_weight`,
/// although it printed `logits` and `weight`; print all operands for parity.
std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;
    oss << "- reduction: " << _attributes->reduction << std::endl;
    oss << "- logits: " << _attributes->logits->info() << std::endl;
    oss << "- target: " << _attributes->target->info() << std::endl;
    if (_attributes->weight) {
        oss << "- weight: " << _attributes->weight->info() << std::endl;
    }
    if (_attributes->pos_weight) {
        oss << "- pos_weight: " << _attributes->pos_weight->info() << std::endl;
    }
    oss << "- out: " << _attributes->out->info() << std::endl;
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return oss.str();
}
// _attributes is a raw owning pointer allocated in build(); release it here.
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::binary_cross_entropy_with_logits
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::cdist {
// Tensors and scalar attributes of one cdist test case.
struct Test::Attributes {
std::shared_ptr<Tensor> x1;
std::shared_ptr<Tensor> x2;
std::shared_ptr<Tensor> out;
std::shared_ptr<Tensor> ans;
double p; // norm order of the pairwise distance (2.0 = Euclidean default)
};
/// Builds a cdist test case from raw attribute bytes and tensors.
/// Throws std::runtime_error when mandatory tensors are missing or the `p`
/// attribute payload is malformed.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    // 1. Validate tensor presence (x1, x2, out, ans are all mandatory).
    if (tensors.find("x1") == tensors.end() || tensors.find("x2") == tensors.end() || tensors.find("out") == tensors.end() || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid Cdist Test: Missing tensors");
    }
    // 2. Read the scalar attribute `p` (stored as a double).
    //    Validate the payload size before reinterpreting the raw bytes —
    //    the previous unchecked cast read out of bounds on a short payload.
    test->_attributes->p = 2.0; // default: Euclidean distance
    auto p_it = attributes.find("p");
    if (p_it != attributes.end()) {
        if (p_it->second.size() < sizeof(double)) {
            throw std::runtime_error("Invalid Cdist Test: malformed p attribute");
        }
        test->_attributes->p = *reinterpret_cast<const double *>(p_it->second.data());
    }
    test->_attributes->x1 = tensors["x1"];
    test->_attributes->x2 = tensors["x2"];
    test->_attributes->out = tensors["out"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
/// Runs the cdist operator on the given device, verifies the M x N distance
/// matrix against the reference, and benchmarks it.
/// Fix over the original: the workspace and the descriptor are now released
/// on every exit path; previously all failure paths after creation leaked them.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopCdistDescriptor_t op_desc;
    // Move operands onto the target device (x1: M x D, x2: N x D).
    auto x1 = _attributes->x1->to(device, device_id);
    auto x2 = _attributes->x2->to(device, device_id);
    auto out = _attributes->out->to(device, device_id);
    // Create the operator descriptor.
    CHECK_OR(infiniopCreateCdistDescriptor(handle, &op_desc,
                                           out->desc(),
                                           x1->desc(),
                                           x2->desc(),
                                           _attributes->p),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cdist descriptor."));
    // Workspace management: from here on the descriptor must be destroyed on failure.
    size_t workspace_size;
    CHECK_OR(infiniopGetCdistWorkspaceSize(op_desc, &workspace_size),
             { infiniopDestroyCdistDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."); });
    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             { infiniopDestroyCdistDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."); });
    // Both resources must now be released on every exit path.
    auto cleanup = [&]() {
        infinirtFree(workspace);
        infiniopDestroyCdistDescriptor(op_desc);
    };
    // Execute once for correctness checking.
    CHECK_OR(infiniopCdist(op_desc, workspace, workspace_size,
                           out->data(),
                           x1->data(),
                           x2->data(),
                           nullptr), // stream
             { cleanup(); return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."); });
    // Verify against the reference answer.
    try {
        allClose(out, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        cleanup();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    // Benchmark.
    double elapsed_time = benchmark(
        [=]() {
            infiniopCdist(op_desc, workspace, workspace_size,
                          out->data(),
                          x1->data(),
                          x2->data(),
                          nullptr);
        },
        warm_ups, iterations);
    cleanup();
    return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
    // Only the norm order `p` is carried as a scalar attribute.
    return std::vector<std::string>{"p"};
}
std::vector<std::string> Test::tensor_names() {
    // All tensor slots this test expects.
    std::vector<std::string> names{"x1", "x2", "out", "ans"};
    return names;
}
std::vector<std::string> Test::output_names() {
    // Only one output tensor is produced.
    return std::vector<std::string>{"out"};
}
// Human-readable summary of the test configuration.
std::string Test::toString() const {
    std::ostringstream text;
    text << op_name() << std::endl
         << "- p: " << _attributes->p << std::endl
         << "- x1: " << _attributes->x1->info() << std::endl
         << "- x2: " << _attributes->x2->info() << std::endl
         << "- out: " << _attributes->out->info() << std::endl;
    // Tolerances are shown in scientific notation with two decimals.
    text << std::scientific << std::setprecision(2)
         << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return text.str();
}
// _attributes is a raw owning pointer allocated in build(); release it here.
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::cdist
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::reciprocal {
// Tensors of one reciprocal test case; `ans` is the reference output
// that `y` is compared against after execution.
struct Test::Attributes {
std::shared_ptr<Tensor> x;
std::shared_ptr<Tensor> y;
std::shared_ptr<Tensor> ans;
};
/// Builds a reciprocal test case; all three tensors are mandatory.
/// Throws std::runtime_error when any of them is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    // Every slot is mandatory for this unary op test.
    for (const char *required : {"x", "y", "ans"}) {
        if (tensors.find(required) == tensors.end()) {
            throw std::runtime_error("Invalid Test");
        }
    }
    test->_attributes->x = tensors["x"];
    test->_attributes->y = tensors["y"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
/// Runs the reciprocal operator, verifies y against the reference, benchmarks.
/// Fix over the original: the workspace was NEVER freed and the descriptor
/// NEVER destroyed — not even on success; both are now released on every
/// exit path after they are acquired.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopReciprocalDescriptor_t op_desc;
    auto x = _attributes->x->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    CHECK_OR(infiniopCreateReciprocalDescriptor(handle, &op_desc,
                                                y->desc(),
                                                x->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetReciprocalWorkspaceSize(op_desc, &workspace_size),
             { infiniopDestroyReciprocalDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."); });
    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             { infiniopDestroyReciprocalDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."); });
    // Both resources must now be released on every exit path.
    auto cleanup = [&]() {
        infinirtFree(workspace);
        infiniopDestroyReciprocalDescriptor(op_desc);
    };
    CHECK_OR(infiniopReciprocal(op_desc, workspace, workspace_size,
                                y->data(),
                                x->data(),
                                nullptr),
             { cleanup(); return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."); });
    // Verify against the reference answer.
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        cleanup();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    // Benchmark.
    double elapsed_time = benchmark(
        [=]() {
            infiniopReciprocal(
                op_desc, workspace, workspace_size,
                y->data(),
                x->data(),
                nullptr);
        },
        warm_ups, iterations);
    cleanup();
    return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
    // Reciprocal carries no scalar attributes.
    return std::vector<std::string>{};
}
std::vector<std::string> Test::tensor_names() {
    // Input, output, and reference answer.
    std::vector<std::string> names{"x", "y", "ans"};
    return names;
}
std::vector<std::string> Test::output_names() {
    // Only one output tensor is produced.
    return std::vector<std::string>{"y"};
}
// Human-readable summary of the test configuration.
std::string Test::toString() const {
    std::ostringstream text;
    text << op_name() << std::endl
         << "- x: " << _attributes->x->info() << std::endl
         << "- y: " << _attributes->y->info() << std::endl;
    // Tolerances are shown in scientific notation with two decimals.
    text << std::scientific << std::setprecision(2)
         << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return text.str();
}
// _attributes is a raw owning pointer allocated in build(); release it here.
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::reciprocal
#include "addcmul_cpu.h"
namespace op::addcmul::cpu {
// Defaulted: members (unique_ptr, ElementwiseInfo) clean themselves up via RAII.
Descriptor::~Descriptor() = default;
/// Creates the CPU addcmul descriptor: validates dtype and shapes, builds the
/// generic elementwise descriptor, then stores the extra scalar `value`.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float value) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // Only floating-point dtypes are supported.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    // Every input must match the output shape (no broadcasting here).
    const auto &out_shape = out_desc->shape();
    for (const auto &desc : input_desc_vec) {
        CHECK_SAME_SHAPE(out_shape, desc->shape());
    }
    // Instantiates the Descriptor and assigns it to *desc_ptr.
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    // Stash the scalar coefficient on the freshly created descriptor.
    (*desc_ptr)->_value = value;
    return INFINI_STATUS_SUCCESS;
}
/// Dispatches the elementwise addcmul computation by output dtype.
/// Fix: removed the unreachable `return INFINI_STATUS_SUCCESS;` that followed
/// the switch — every case (including default) already returns.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // AddcmulOp (defined in addcmul_cpu.h) is instantiated per dtype; the
    // scalar _value is forwarded as the trailing argument.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<AddcmulOp, fp16_t>(_info, output, inputs, stream, _value);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<AddcmulOp, float>(_info, output, inputs, stream, _value);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<AddcmulOp, double>(_info, output, inputs, stream, _value);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<AddcmulOp, bf16_t>(_info, output, inputs, stream, _value);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::cpu
#ifndef __ADDCMUL_CPU_H__
#define __ADDCMUL_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <cmath>
#include <type_traits>
namespace op::addcmul::cpu {
struct AddcmulOp {
public:
    // Ternary elementwise op: out = input + value * t1 * t2.
    static constexpr size_t num_inputs = 3;
    template <typename T, typename Scalar>
    T operator()(const T &input, const T &t1, const T &t2, Scalar value) const {
        if constexpr (std::is_floating_point_v<T>) {
            // Native float/double: compute directly in T.
            return input + static_cast<T>(value) * t1 * t2;
        } else {
            // fp16/bf16 and similar: widen to float for the multiply-add to
            // keep precision, then narrow the result back to T.
            const float acc = static_cast<float>(input)
                            + static_cast<float>(value) * static_cast<float>(t1) * static_cast<float>(t2);
            return static_cast<T>(acc);
        }
    }
};
// CPU-side Descriptor for addcmul; extends the generic elementwise descriptor
// pattern with one extra scalar parameter (`value`).
class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
std::unique_ptr<op::elementwise::cpu::DeviceImpl> _device_info;
size_t _workspace_size;
float _value; // scalar coefficient in out = input + value * t1 * t2
// Private: instances are created through the static create() factory.
Descriptor(
infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::cpu::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_device_info(device_info),
_workspace_size(workspace_size),
_value(0.0f) {} // value is patched in by create() after construction
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
// Factory; takes the extra scalar `value` in addition to tensor descriptors.
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
std::vector<infiniopTensorDescriptor_t> input_descs,
float value);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
float getValue() const { return _value; }
};
} // namespace op::addcmul::cpu
#endif // __ADDCMUL_CPU_H__
#ifndef __ADDCMUL_CUDA_CUH__
#define __ADDCMUL_CUDA_CUH__
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <type_traits>
namespace op::addcmul::cuda {
struct AddcmulOp {
public:
    // Ternary elementwise op: out = input + value * t1 * t2.
    static constexpr size_t num_inputs = 3;
    template <typename T>
    __device__ __host__ __forceinline__ T operator()(const T &input, const T &t1, const T &t2, float value) const {
        if constexpr (std::is_same_v<T, double>) {
            return input + static_cast<double>(value) * t1 * t2;
        } else if constexpr (std::is_same_v<T, float>) {
            return input + value * t1 * t2;
        } else if constexpr (std::is_same_v<T, half>) {
            // Widen to float to preserve precision and simplify the scalar multiply.
            const float acc = __half2float(input) + value * __half2float(t1) * __half2float(t2);
            return __float2half(acc);
        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
            const float acc = __bfloat162float(input) + value * __bfloat162float(t1) * __bfloat162float(t2);
            return __float2bfloat16(acc);
        } else {
            // Fallback for any other element type: compute in float, cast back.
            return static_cast<T>(static_cast<float>(input) + value * static_cast<float>(t1) * static_cast<float>(t2));
        }
    }
};
} // namespace op::addcmul::cuda
#endif // __ADDCMUL_CUDA_CUH__
#ifndef __ADDCMUL_METAX_H__
#define __ADDCMUL_METAX_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
namespace op::addcmul::metax {
// METAX-side Descriptor for addcmul; supports the extra scalar parameter `value`.
class Descriptor final : public InfiniopDescriptor {
// Kept for compatibility with the generic Elementwise framework.
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
size_t _workspace_size;
float _value; // scalar coefficient in out = input + value * t1 * t2
public:
// Tensor metadata recorded for the custom device kernel (strided access).
static constexpr int MAX_NDIM = 8;
struct TensorMeta {
int ndim;
size_t shape[MAX_NDIM];
ptrdiff_t strides[MAX_NDIM];
};
TensorMeta _out_meta{};
TensorMeta _input_meta{};
TensorMeta _t1_meta{};
TensorMeta _t2_meta{};
size_t _output_size{0}; // total number of output elements
Descriptor(
infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::metax::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_device_info(device_info),
_workspace_size(workspace_size),
_value(0.0f) {} // value is patched in by create() after construction
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
// Factory; takes the extra scalar `value` in addition to tensor descriptors.
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
std::vector<infiniopTensorDescriptor_t> input_descs,
float value);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
float getValue() const { return _value; }
};
} // namespace op::addcmul::metax
#endif // __ADDCMUL_METAX_H__
#include "../../../elementwise/metax/elementwise_metax.h"
#include "addcmul_metax.h"
#include "addcmul_metax_kernel.h"
namespace op::addcmul::metax {
// Defaulted: members (unique_ptr, ElementwiseInfo) clean themselves up via RAII.
Descriptor::~Descriptor() = default;
// Copies a TensorDescriptor's shape/strides into the fixed-size TensorMeta
// used by the custom kernel; fails for ranks above MAX_NDIM.
static inline infiniStatus_t fill_tensor_meta(
    infiniopTensorDescriptor_t desc,
    Descriptor::TensorMeta &meta) {
    const auto rank = desc->ndim();
    if (rank > Descriptor::MAX_NDIM) {
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
    meta.ndim = static_cast<int>(rank);
    const auto &dims = desc->shape();
    const auto &strs = desc->strides();
    for (int d = 0; d < meta.ndim; ++d) {
        meta.shape[d] = dims[d];
        meta.strides[d] = strs[d];
    }
    return INFINI_STATUS_SUCCESS;
}
// Creates the METAX addcmul descriptor: validates dtype/shapes, builds the
// generic elementwise descriptor, records tensor metadata for the custom
// kernel, and stores the scalar `value`.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float value) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
// 1. dtype check: floating-point types only.
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
// 2. Shape check: output and all three inputs must match (no broadcasting).
const auto &out_shape = out_desc->shape();
const auto &input_desc = input_desc_vec.at(0);
const auto &t1_desc = input_desc_vec.at(1);
const auto &t2_desc = input_desc_vec.at(2);
CHECK_SAME_SHAPE(out_shape, input_desc->shape());
CHECK_SAME_SHAPE(out_shape, t1_desc->shape());
CHECK_SAME_SHAPE(out_shape, t2_desc->shape());
// 3. Create the underlying elementwise METAX descriptor (assigns *desc_ptr).
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
// 4. Record tensor metadata and output element count for the custom kernel.
auto *desc = *desc_ptr;
desc->_output_size = out_desc->numel();
CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta));
CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta));
CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta));
CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta));
// 5. Stash the scalar coefficient on the descriptor.
desc->_value = value;
return INFINI_STATUS_SUCCESS;
}
// Custom addcmul METAX kernel: uses the Descriptor's TensorMeta for generic
// strided access, so non-contiguous tensors are handled correctly.
// One thread computes one output element.
template <typename T>
INFINIOP_METAX_KERNEL addcmul_kernel(
size_t output_size,
Descriptor::TensorMeta out_meta,
Descriptor::TensorMeta in_meta,
Descriptor::TensorMeta t1_meta,
Descriptor::TensorMeta t2_meta,
T *out,
const T *input,
const T *t1,
const T *t2,
float value) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
// Threads beyond the element count exit early.
if (idx >= output_size) {
return;
}
// Decompose the linear index into per-dimension coordinates (last dimension
// fastest) and accumulate each tensor's strided offset.
ptrdiff_t out_offset = 0;
ptrdiff_t in_offset = 0;
ptrdiff_t t1_offset = 0;
ptrdiff_t t2_offset = 0;
size_t linear = idx;
for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
size_t dim_size = out_meta.shape[dim];
size_t coord = linear % dim_size;
linear /= dim_size;
out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
}
T in_val = input[in_offset];
T t1_val = t1[t1_offset];
T t2_val = t2[t2_offset];
// out = input + value * t1 * t2 (see AddcmulOp).
out[out_offset] = op::addcmul::metax::AddcmulOp{}(in_val, t1_val, t2_val, value);
}
// Launch wrapper: configures the grid, forwards the recorded tensor metadata
// to the strided kernel, and surfaces launch errors via CHECK_METAX.
template <typename T>
static inline infiniStatus_t launch_addcmul_kernel(
    const Descriptor *desc,
    void *output,
    const std::vector<const void *> &inputs,
    void *stream) {
    const size_t element_count = desc->_output_size;
    // Empty output: nothing to launch.
    if (element_count == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    constexpr uint32_t BLOCK_SIZE = 256;
    const uint32_t grid_dim = static_cast<uint32_t>((element_count + BLOCK_SIZE - 1) / BLOCK_SIZE);
    auto metax_stream = reinterpret_cast<mcStream_t>(stream);
    addcmul_kernel<T><<<grid_dim, BLOCK_SIZE, 0, metax_stream>>>(
        element_count,
        desc->_out_meta,
        desc->_input_meta,
        desc->_t1_meta,
        desc->_t2_meta,
        reinterpret_cast<T *>(output),
        reinterpret_cast<const T *>(inputs.at(0)),
        reinterpret_cast<const T *>(inputs.at(1)),
        reinterpret_cast<const T *>(inputs.at(2)),
        desc->getValue());
    CHECK_METAX(mcGetLastError());
    return INFINI_STATUS_SUCCESS;
}
// Dispatches the custom strided METAX kernel by output dtype.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // Workspace contents are unused; only the size contract is enforced to
    // stay consistent with the other operators' interface semantics.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Bypass the generic elementwise framework and call the kernel directly.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return launch_addcmul_kernel<half>(this, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return launch_addcmul_kernel<cuda_bfloat16>(this, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return launch_addcmul_kernel<float>(this, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return launch_addcmul_kernel<double>(this, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::metax
#ifndef __ADDCMUL_METAX_KERNEL_H__
#define __ADDCMUL_METAX_KERNEL_H__
/*
 * This file contains the Addcmul operation implementation for the METAX backend.
 * Formula: out = input + value * tensor1 * tensor2
 */
namespace op::addcmul::metax {
typedef struct AddcmulOp {
public:
// Ternary elementwise op: inputs are input, tensor1, tensor2.
static constexpr size_t num_inputs = 3;
template <typename T>
__device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const {
if constexpr (std::is_same_v<T, float>) {
// F32: compute directly; the compiler can fuse this into a multiply-add.
return in + value * t1 * t2;
} else if constexpr (std::is_same_v<T, half>) {
// F16: widen to float to avoid losing precision in the intermediate product.
float f_in = __half2float(in);
float f_t1 = __half2float(t1);
float f_t2 = __half2float(t2);
return __float2half(f_in + value * f_t1 * f_t2);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// BF16: likewise widen to float.
float f_in = __bfloat162float(in);
float f_t1 = __bfloat162float(t1);
float f_t2 = __bfloat162float(t2);
return __float2bfloat16_rn(f_in + value * f_t1 * f_t2);
} else if constexpr (std::is_same_v<T, double>) {
return in + (double)value * t1 * t2;
} else {
// Integer or other element types: cast the scalar into T.
return in + static_cast<T>(value) * t1 * t2;
}
}
} AddcmulOp;
} // namespace op::addcmul::metax
#endif // __ADDCMUL_METAX_KERNEL_H__
#ifndef __ADDCMUL_MOORE_H__
#define __ADDCMUL_MOORE_H__
// 1. 切换到 Moore 平台的 Elementwise API
#include "../../../elementwise/moore/elementwise_moore_api.h"
namespace op::addcmul::moore {
/**
 * Moore-side Descriptor for addcmul.
 * Mirrors the NVIDIA version's structure so the platforms stay aligned.
 */
class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
// Moore-device implementation handle (see elementwise_moore_api.h).
std::unique_ptr<op::elementwise::moore::DeviceImpl> _device_info;
size_t _workspace_size;
float _value; // scalar coefficient in out = input + value * t1 * t2
public:
// MUSA also supports strided access; tensor metadata for the custom kernel.
static constexpr int MAX_NDIM = 8;
struct TensorMeta {
int ndim;
size_t shape[MAX_NDIM];
ptrdiff_t strides[MAX_NDIM];
};
TensorMeta _out_meta{};
TensorMeta _input_meta{};
TensorMeta _t1_meta{};
TensorMeta _t2_meta{};
size_t _output_size{0}; // total number of output elements
Descriptor(
infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::moore::DeviceImpl *device_info, // Moore device impl
size_t workspace_size,
infiniDevice_t device_type,
int device_id)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_device_info(device_info),
_workspace_size(workspace_size),
_value(0.0f) {} // value is patched in by create() after construction
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
// Factory; same interface as the other backends, including the scalar `value`.
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
std::vector<infiniopTensorDescriptor_t> input_descs,
float value);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
float getValue() const { return _value; }
};
} // namespace op::addcmul::moore
#endif // __ADDCMUL_MOORE_H__
#include "../../../elementwise/moore/elementwise_moore.h"
#include "addcmul_moore.h"
#include "addcmul_moore_kernel.h"
#include <musa_runtime.h>
namespace op::addcmul::moore {
// Defaulted: members (unique_ptr, ElementwiseInfo) clean themselves up via RAII.
Descriptor::~Descriptor() = default;
// Copies a TensorDescriptor's shape/strides into the fixed-size TensorMeta
// used by the MUSA kernel's strided addressing (same logic as NVIDIA).
static inline infiniStatus_t fill_tensor_meta(
    infiniopTensorDescriptor_t desc,
    Descriptor::TensorMeta &meta) {
    const auto rank = desc->ndim();
    if (rank > Descriptor::MAX_NDIM) {
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
    meta.ndim = static_cast<int>(rank);
    const auto &dims = desc->shape();
    const auto &strs = desc->strides();
    for (int d = 0; d < meta.ndim; ++d) {
        meta.shape[d] = dims[d];
        meta.strides[d] = strs[d];
    }
    return INFINI_STATUS_SUCCESS;
}
// Creates the Moore addcmul descriptor: validates dtype/shapes, builds the
// generic elementwise descriptor, records tensor metadata for the custom
// MUSA kernel, and stores the scalar `value`.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float value) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();
// dtype check: floating-point types only.
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
// Shape check: output and all three inputs must match (no broadcasting).
const auto &out_shape = out_desc->shape();
const auto &input_desc = input_desc_vec.at(0);
const auto &t1_desc = input_desc_vec.at(1);
const auto &t2_desc = input_desc_vec.at(2);
CHECK_SAME_SHAPE(out_shape, input_desc->shape());
CHECK_SAME_SHAPE(out_shape, t1_desc->shape());
CHECK_SAME_SHAPE(out_shape, t2_desc->shape());
// Create the underlying elementwise Moore descriptor (assigns *desc_ptr).
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
auto *desc = *desc_ptr;
desc->_output_size = out_desc->numel();
// Record tensor metadata for the custom kernel.
CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta));
CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta));
CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta));
CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta));
desc->_value = value;
return INFINI_STATUS_SUCCESS;
}
// MUSA kernel: same strided-addressing logic as the other backends.
// One thread computes one output element.
template <typename T>
__global__ void addcmul_kernel(
size_t output_size,
Descriptor::TensorMeta out_meta,
Descriptor::TensorMeta in_meta,
Descriptor::TensorMeta t1_meta,
Descriptor::TensorMeta t2_meta,
T *out,
const T *input,
const T *t1,
const T *t2,
float value) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
// Threads beyond the element count exit early.
if (idx >= output_size) {
return;
}
ptrdiff_t out_offset = 0, in_offset = 0, t1_offset = 0, t2_offset = 0;
size_t linear = idx;
// Generic multi-dimensional index -> strided offset conversion
// (last dimension varies fastest).
for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
size_t dim_size = out_meta.shape[dim];
size_t coord = linear % dim_size;
linear /= dim_size;
out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
}
// Delegate the arithmetic to the Moore AddcmulOp functor.
out[out_offset] = op::addcmul::moore::AddcmulOp{}(input[in_offset], t1[t1_offset], t2[t2_offset], value);
}
// Launch wrapper for the MUSA kernel.
// Fix: the original returned success unconditionally after the launch; the
// METAX twin checks mcGetLastError() here. Surface launch-configuration
// errors instead of silently dropping them.
template <typename T>
static inline infiniStatus_t launch_addcmul_kernel(
    const Descriptor *desc,
    void *output,
    const std::vector<const void *> &inputs,
    void *stream) {
    size_t output_size = desc->_output_size;
    // Empty output: nothing to launch.
    if (output_size == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    auto *out_ptr = reinterpret_cast<T *>(output);
    auto *in_ptr = reinterpret_cast<const T *>(inputs.at(0));
    auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
    auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));
    musaStream_t musa_stream = reinterpret_cast<musaStream_t>(stream);
    constexpr uint32_t BLOCK_SIZE = 256;
    uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
    addcmul_kernel<T><<<grid, BLOCK_SIZE, 0, musa_stream>>>(
        output_size, desc->_out_meta, desc->_input_meta, desc->_t1_meta, desc->_t2_meta,
        out_ptr, in_ptr, t1_ptr, t2_ptr, desc->getValue());
    // Asynchronous launch: this only catches launch/configuration errors,
    // not failures that occur later during kernel execution.
    if (musaGetLastError() != musaSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}
// Dispatches the custom strided MUSA kernel by output dtype.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // Workspace contents are unused; only the size contract is enforced.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return launch_addcmul_kernel<half>(this, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        // The Moore toolchain exposes bf16 under the cuda_bfloat16 name.
        return launch_addcmul_kernel<cuda_bfloat16>(this, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return launch_addcmul_kernel<float>(this, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return launch_addcmul_kernel<double>(this, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::moore
#ifndef __ADDCMUL_MOORE_KERNEL_H__
#define __ADDCMUL_MOORE_KERNEL_H__
/*
* This file contains the Addcmul operation implementation for the MUSA backend.
* Formula: out = input + value * tensor1 * tensor2
*/
namespace op::addcmul::moore {
typedef struct AddcmulOp {
public:
// Ternary elementwise op: inputs are input, tensor1, tensor2.
static constexpr size_t num_inputs = 3;
template <typename T>
__device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const {
if constexpr (std::is_same_v<T, float>) {
// F32: compute directly; the compiler can fuse this into a multiply-add.
return in + value * t1 * t2;
} else if constexpr (std::is_same_v<T, half>) {
// F16: widen to float to avoid losing precision in the intermediate product.
float f_in = __half2float(in);
float f_t1 = __half2float(t1);
float f_t2 = __half2float(t2);
return __float2half(f_in + value * f_t1 * f_t2);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// BF16: likewise widen to float.
float f_in = __bfloat162float(in);
float f_t1 = __bfloat162float(t1);
float f_t2 = __bfloat162float(t2);
return __float2bfloat16_rn(f_in + value * f_t1 * f_t2);
} else if constexpr (std::is_same_v<T, double>) {
return in + (double)value * t1 * t2;
} else {
// Integer or other element types: cast the scalar into T.
return in + static_cast<T>(value) * t1 * t2;
}
}
} AddcmulOp;
} // namespace op::addcmul::moore
#endif // __ADDCMUL_MOORE_KERNEL_H__
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "addcmul_nvidia.cuh"
namespace op::addcmul::nvidia {
// Defaulted: members (unique_ptr, ElementwiseInfo) clean themselves up via RAII.
Descriptor::~Descriptor() = default;
// Copies a TensorDescriptor's shape/strides into the fixed-size TensorMeta
// consumed by the custom CUDA kernel; fails for ranks above MAX_NDIM.
static inline infiniStatus_t fill_tensor_meta(
    infiniopTensorDescriptor_t desc,
    Descriptor::TensorMeta &meta) {
    const auto rank = desc->ndim();
    if (rank > Descriptor::MAX_NDIM) {
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
    meta.ndim = static_cast<int>(rank);
    const auto &dims = desc->shape();
    const auto &strs = desc->strides();
    for (int d = 0; d < meta.ndim; ++d) {
        meta.shape[d] = dims[d];
        meta.strides[d] = strs[d];
    }
    return INFINI_STATUS_SUCCESS;
}
// Build an addcmul Descriptor for the NVIDIA backend.
// Validates dtype/shapes, creates the generic elementwise descriptor (which
// allocates *desc_ptr), then records the strided-kernel metadata and the
// scalar attribute `value` on the freshly created descriptor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float value) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // 1. Dtype check: only floating-point element types are supported.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    // 2. Shape check: output must match all three inputs (no broadcasting).
    const auto &out_shape = out_desc->shape();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &t1_desc = input_desc_vec.at(1);
    const auto &t2_desc = input_desc_vec.at(2);
    CHECK_SAME_SHAPE(out_shape, input_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t1_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t2_desc->shape());
    // 3. Create the underlying elementwise CUDA descriptor.
    //    NOTE(review): this macro appears to allocate and assign *desc_ptr —
    //    the assignments below rely on that.
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    // 4. Record tensor metadata and output element count for the custom kernel.
    auto *desc = *desc_ptr;
    desc->_output_size = out_desc->numel();
    CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta));
    CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta));
    CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta));
    CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta));
    // 5. Store the scalar attribute `value` inside the Descriptor.
    desc->_value = value;
    return INFINI_STATUS_SUCCESS;
}
// 自定义 addcmul CUDA kernel:使用 Descriptor 中的 TensorMeta 做通用 strided 访问
// Custom addcmul CUDA kernel: uses the TensorMeta stored in the Descriptor to
// perform generic strided accesses, so non-contiguous layouts are supported.
// One thread computes one output element, identified by its linear index over
// the output shape.
template <typename T>
INFINIOP_CUDA_KERNEL addcmul_kernel(
    size_t output_size,
    Descriptor::TensorMeta out_meta,
    Descriptor::TensorMeta in_meta,
    Descriptor::TensorMeta t1_meta,
    Descriptor::TensorMeta t2_meta,
    T *out,
    const T *input,
    const T *t1,
    const T *t2,
    float value) {
    // FIX: widen blockIdx.x to size_t BEFORE the multiply. The original
    // `blockIdx.x * blockDim.x` was evaluated in 32-bit unsigned arithmetic
    // and would wrap for tensors with more than 2^32 elements.
    size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= output_size) {
        return;
    }
    // Decompose the linear index into per-dimension coordinates (innermost
    // dimension first) and accumulate each tensor's offset via its strides.
    // A 0-d (scalar) tensor skips the loop and uses offset 0.
    ptrdiff_t out_offset = 0;
    ptrdiff_t in_offset = 0;
    ptrdiff_t t1_offset = 0;
    ptrdiff_t t2_offset = 0;
    size_t linear = idx;
    for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
        size_t dim_size = out_meta.shape[dim];
        size_t coord = linear % dim_size;
        linear /= dim_size;
        out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
        in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
        t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
        t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
    }
    // Strides are used as direct pointer indices here, so they are presumably
    // element strides, not byte strides — TODO confirm against TensorDescriptor.
    T in_val = input[in_offset];
    T t1_val = t1[t1_offset];
    T t2_val = t2[t2_offset];
    out[out_offset] = op::addcmul::cuda::AddcmulOp{}(in_val, t1_val, t2_val, value);
}
// Host-side launcher: casts the type-erased buffers to T, derives the launch
// configuration from the output element count, and dispatches addcmul_kernel<T>
// on the given stream. Returns immediately for empty outputs.
template <typename T>
static inline infiniStatus_t launch_addcmul_kernel(
    const Descriptor *desc,
    void *output,
    const std::vector<const void *> &inputs,
    void *stream) {
    const size_t total = desc->_output_size;
    // Empty tensor: nothing to launch.
    if (total == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    auto *dst = reinterpret_cast<T *>(output);
    auto *src_in = reinterpret_cast<const T *>(inputs.at(0));
    auto *src_t1 = reinterpret_cast<const T *>(inputs.at(1));
    auto *src_t2 = reinterpret_cast<const T *>(inputs.at(2));
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    constexpr uint32_t BLOCK_SIZE = 256;
    const uint32_t grid = static_cast<uint32_t>((total + BLOCK_SIZE - 1) / BLOCK_SIZE);
    addcmul_kernel<T><<<grid, BLOCK_SIZE, 0, cuda_stream>>>(
        total,
        desc->_out_meta,
        desc->_input_meta,
        desc->_t1_meta,
        desc->_t2_meta,
        dst,
        src_in,
        src_t1,
        src_t2,
        desc->getValue());
    // Surface any launch-configuration error right away.
    CHECK_CUDA(cudaGetLastError());
    return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // The workspace contents are unused; the size check only keeps the
    // interface semantics consistent with the other operators.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Dispatch straight to the custom CUDA kernel (bypassing the generic
    // elementwise framework) on the element type recorded at create time.
    switch (_dtype) {
    case INFINI_DTYPE_F32:
        return launch_addcmul_kernel<float>(this, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return launch_addcmul_kernel<double>(this, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return launch_addcmul_kernel<half>(this, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return launch_addcmul_kernel<nv_bfloat16>(this, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::nvidia
#ifndef __ADDCMUL_NVIDIA_H__
#define __ADDCMUL_NVIDIA_H__

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

namespace op::addcmul::nvidia {

// Descriptor for addcmul on the NVIDIA backend.
// Hand-written (instead of the generic ELEMENTWISE_DESCRIPTOR macro) because
// the operator carries an extra scalar attribute `value`
// (out = input + value * t1 * t2) and the launch path uses a custom strided
// CUDA kernel that needs per-tensor shape/stride metadata.
class Descriptor final : public InfiniopDescriptor {
    // Members retained for compatibility with the generic elementwise framework.
    infiniDtype_t _dtype;
    op::elementwise::ElementwiseInfo _info;
    std::unique_ptr<op::elementwise::nvidia::DeviceImpl> _device_info;
    size_t _workspace_size;
    float _value; // scalar coefficient `value`

    // NOTE: the original had two separate `public:` sections; the redundant
    // second specifier has been removed and all public members merged here.
public:
    // Fixed-size tensor metadata snapshot consumed by the custom CUDA kernel.
    static constexpr int MAX_NDIM = 8;
    struct TensorMeta {
        int ndim; // number of valid dimensions (<= MAX_NDIM)
        size_t shape[MAX_NDIM];
        ptrdiff_t strides[MAX_NDIM]; // presumably element strides — the kernel indexes pointers directly; confirm
    };
    TensorMeta _out_meta{};
    TensorMeta _input_meta{};
    TensorMeta _t1_meta{};
    TensorMeta _t2_meta{};
    size_t _output_size{0}; // total number of output elements

    // Takes ownership of `device_info`. `_value` starts at 0.0f and is filled
    // in by create() after construction.
    Descriptor(
        infiniDtype_t dtype,
        op::elementwise::ElementwiseInfo info,
        op::elementwise::nvidia::DeviceImpl *device_info,
        size_t workspace_size,
        infiniDevice_t device_type,
        int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _dtype(dtype),
          _info(std::move(info)),
          _device_info(device_info),
          _workspace_size(workspace_size),
          _value(0.0f) {}

    ~Descriptor();

    size_t workspaceSize() const { return _workspace_size; }

    // Same shape as the generic descriptors' factory, plus the extra scalar
    // `value` attribute.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t output_desc,
        std::vector<infiniopTensorDescriptor_t> input_descs,
        float value);

    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;

    float getValue() const { return _value; }
};

} // namespace op::addcmul::nvidia

#endif // __ADDCMUL_NVIDIA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/addcmul.h"
#ifdef ENABLE_CPU_API
#include "cpu/addcmul_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/addcmul_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/addcmul_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/addcmul_kunlun.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/addcmul_bang.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/addcmul_moore.h"
#endif
// Public C API: create an addcmul descriptor for the device bound to `handle`.
// Forwards to the backend-specific Descriptor::create, packing the three input
// descriptors into a vector and passing the scalar attribute `value` through.
__INFINI_C infiniStatus_t infiniopCreateAddcmulDescriptor(
    infiniopHandle_t handle,
    infiniopAddcmulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t t1_desc,
    infiniopTensorDescriptor_t t2_desc,
    float value) {

#define CREATE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        return op::addcmul::NAMESPACE::Descriptor::create(                  \
            handle,                                                         \
            reinterpret_cast<op::addcmul::NAMESPACE::Descriptor **>(desc_ptr), \
            out_desc,                                                       \
            {input_desc, t1_desc, t2_desc},                                 \
            value)

    // Dispatch on the handle's device type; each backend is compiled in only
    // when its ENABLE_* flag is set. ILUVATAR and QY reuse the NVIDIA code path.
    switch (handle->device) {

#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Public C API: query the workspace size (in bytes, per the backend's
// workspaceSize()) required by a previously created addcmul descriptor.
__INFINI_C infiniStatus_t infiniopGetAddcmulWorkspaceSize(infiniopAddcmulDescriptor_t desc, size_t *size) {

#define GET(CASE, NAMESPACE)                                                              \
    case CASE:                                                                            \
        *size = reinterpret_cast<op::addcmul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    // Dispatch on the device type recorded in the descriptor itself.
    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// Public C API: execute addcmul (out = input + value * t1 * t2; `value` was
// baked into the descriptor at creation). Forwards to the backend-specific
// Descriptor::calculate with the three input buffers packed into a vector.
__INFINI_C infiniStatus_t infiniopAddcmul(
    infiniopAddcmulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *input,
    const void *t1,
    const void *t2,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                            \
    case CASE:                                                                \
        return reinterpret_cast<const op::addcmul::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, out, {input, t1, t2}, stream)

    // Dispatch on the device type recorded in the descriptor.
    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// Public C API: destroy an addcmul descriptor, deleting it through its
// concrete backend type so the correct destructor runs.
__INFINI_C infiniStatus_t infiniopDestroyAddcmulDescriptor(infiniopAddcmulDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                               \
    case CASE:                                                                \
        delete reinterpret_cast<const op::addcmul::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    // Dispatch on the device type recorded in the descriptor.
    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
#include "atanh_cpu.h"
namespace op::atanh::cpu {
Descriptor::~Descriptor() = default;

// Build an atanh Descriptor for the CPU backend: validates the element type
// and input/output shape agreement, then delegates descriptor construction
// (including *desc_ptr allocation) to the shared elementwise macro.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &a_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    // Unary op: output must be floating point and match the input's shape.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, a_shape);
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Run atanh element-wise on the CPU. Dispatches on the element type recorded
// at create time; the AtanhOp functor (defined in atanh_cpu.h) supplies the
// per-element math. `workspace`/`workspace_size` are unused by this backend.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<AtanhOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<AtanhOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<AtanhOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<AtanhOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch branch returns, so the previously present trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable dead code and is removed.
}
} // namespace op::atanh::cpu
#ifndef __ATANH_CPU_H__
#define __ATANH_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <cmath>
#include <type_traits>
// 注册 atanh 算子在 cpu 后端的 descriptor
ELEMENTWISE_DESCRIPTOR(atanh, cpu)
namespace op::atanh::cpu {
typedef struct AtanhOp {
public:
// atanh 是一元算子
static constexpr size_t num_inputs = 1;
template <typename T>
T operator()(const T &a) const {
// 对于 float, double 等原生支持的类型直接调用 std::atanh
if constexpr (std::is_floating_point_v<T>) {
return std::atanh(a);
} else {
// 对于 half, bfloat16 等自定义类型,先转为 float 计算再转回
// 假设这些类型支持 static_cast 到 float
return static_cast<T>(std::atanhf(static_cast<float>(a)));
}
}
} AtanhOp;
} // namespace op::atanh::cpu
#endif // __ATANH_CPU_H__
#ifndef __ATANH_CUDA_H__
#define __ATANH_CUDA_H__

#include <cuda_bf16.h>
#include <cuda_fp16.h>

#include <cstddef>     // size_t
#include <type_traits> // std::is_same_v — do not rely on transitive includes

namespace op::atanh::cuda {

// Per-element atanh functor for the CUDA elementwise framework.
struct AtanhOp {
public:
    // atanh is a unary operator: exactly one input.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &a) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed half2: apply atanh to both lanes in float precision.
            float2 f = __half22float2(a);
            f.x = atanhf(f.x);
            f.y = atanhf(f.y);
            return __float22half2_rn(f);
        } else if constexpr (std::is_same_v<T, half>) {
            // half: compute in float, convert back.
            return __float2half(atanhf(__half2float(a)));
        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
            // bfloat16: same float round-trip.
            return __float2bfloat16(atanhf(__bfloat162float(a)));
        } else if constexpr (std::is_same_v<T, float>) {
            // float: call the device math library directly.
            return atanhf(a);
        } else if constexpr (std::is_same_v<T, double>) {
            return ::atanh(a);
        } else {
            // Integer/unsupported types should not reach here; simple fallback
            // through float.
            return static_cast<T>(atanhf(static_cast<float>(a)));
        }
    }
};

} // namespace op::atanh::cuda

#endif // __ATANH_CUDA_H__
#ifndef __ATANH_METAX_API_H__
#define __ATANH_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares op::atanh::metax::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(atanh, metax)

#endif // __ATANH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax.h"
#include "atanh_metax.h"
#include "atanh_metax_kernel.h"
namespace op::atanh::metax {
Descriptor::~Descriptor() = default;

// Build an atanh Descriptor for the Metax backend: validates the element type
// and input/output shape agreement, then delegates descriptor construction
// (including *desc_ptr allocation) to the shared elementwise macro.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &a_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    // Unary op: output must be floating point and match the input's shape.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(y_shape, a_shape);
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Run atanh element-wise on the Metax device. Verifies the caller supplied at
// least the workspace size queried at create time, then dispatches on the
// element type with a fixed 256-thread block; AtanhOp supplies the math.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, metax::AtanhOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, metax::AtanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, metax::AtanhOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, metax::AtanhOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch branch returns, so the previously present trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable dead code and is removed.
}
} // namespace op::atanh::metax
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment