Unverified Commit 21c6af2d authored by thatPepe's avatar thatPepe Committed by GitHub
Browse files

Merge pull request #1069 from InfiniTensor/issue/1031_T1_1_15

【算子比赛2025秋】T1-1-15
parents 99a802dd 5f329d7a
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::binary_cross_entropy_with_logits {
// Holds the tensors and scalar attributes of one BCE-with-logits test case.
struct Test::Attributes {
std::shared_ptr<Tensor> logits;
std::shared_ptr<Tensor> target;
std::shared_ptr<Tensor> weight; // optional; nullptr when absent from the test file
std::shared_ptr<Tensor> pos_weight; // optional; nullptr when absent from the test file
std::shared_ptr<Tensor> out;
std::shared_ptr<Tensor> ans;
int reduction; // 0: none, 1: mean, 2: sum
};
/// Builds a BCE-with-logits test case from raw attribute bytes and tensors.
/// Throws std::runtime_error when mandatory tensors are missing or the
/// `reduction` attribute payload is malformed.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    // 1. Validate that all mandatory tensors are present.
    if (tensors.find("logits") == tensors.end() || tensors.find("target") == tensors.end() || tensors.find("out") == tensors.end() || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid BCE Test: Missing mandatory tensors");
    }
    // 2. Read the `reduction` attribute (defaults to 1: mean).
    //    Validate the payload size before reinterpreting the raw bytes —
    //    the previous unchecked cast read out of bounds on a short payload.
    test->_attributes->reduction = 1;
    auto red_it = attributes.find("reduction");
    if (red_it != attributes.end()) {
        if (red_it->second.size() < sizeof(int)) {
            throw std::runtime_error("Invalid BCE Test: malformed reduction attribute");
        }
        test->_attributes->reduction = *reinterpret_cast<const int *>(red_it->second.data());
    }
    // 3. Fill in tensors; optional ones fall back to nullptr when absent.
    test->_attributes->logits = tensors["logits"];
    test->_attributes->target = tensors["target"];
    test->_attributes->out = tensors["out"];
    test->_attributes->ans = tensors["ans"];
    test->_attributes->weight = tensors.count("weight") ? tensors["weight"] : nullptr;
    test->_attributes->pos_weight = tensors.count("pos_weight") ? tensors["pos_weight"] : nullptr;
    return test;
}
/// Runs the BCE-with-logits operator on the given device, verifies the result
/// against the reference tensor, and benchmarks it.
/// Fix over the original: the workspace and the descriptor are now released
/// on every exit path (execution failure, wrong result, success); previously
/// all failure paths after creation leaked them.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopBCEWithLogitsDescriptor_t op_desc;
    // Move operands onto the target device.
    auto logits = _attributes->logits->to(device, device_id);
    auto target = _attributes->target->to(device, device_id);
    auto out = _attributes->out->to(device, device_id);
    // Optional tensors may be absent (nullptr).
    std::shared_ptr<Tensor> weight = (_attributes->weight) ? _attributes->weight->to(device, device_id) : nullptr;
    std::shared_ptr<Tensor> pos_weight = (_attributes->pos_weight) ? _attributes->pos_weight->to(device, device_id) : nullptr;
    // Create the descriptor; optional operands pass a null descriptor.
    auto w_desc = weight ? weight->desc() : nullptr;
    auto pw_desc = pos_weight ? pos_weight->desc() : nullptr;
    CHECK_OR(infiniopCreateBCEWithLogitsDescriptor(handle, &op_desc,
                                                   out->desc(),
                                                   logits->desc(),
                                                   target->desc(),
                                                   w_desc,
                                                   pw_desc,
                                                   static_cast<infiniopReduction_t>(_attributes->reduction)),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create BCE descriptor."));
    // Workspace management: from here on the descriptor must be destroyed on failure.
    size_t workspace_size;
    CHECK_OR(infiniopGetBCEWithLogitsWorkspaceSize(op_desc, &workspace_size),
             { infiniopDestroyBCEWithLogitsDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."); });
    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             { infiniopDestroyBCEWithLogitsDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."); });
    // Both resources must now be released on every exit path.
    auto cleanup = [&]() {
        infinirtFree(workspace);
        infiniopDestroyBCEWithLogitsDescriptor(op_desc);
    };
    // Execute once for correctness checking.
    auto w_data = weight ? weight->data() : nullptr;
    auto pw_data = pos_weight ? pos_weight->data() : nullptr;
    CHECK_OR(infiniopBCEWithLogits(op_desc, workspace, workspace_size,
                                   out->data(),
                                   logits->data(),
                                   target->data(),
                                   w_data,
                                   pw_data,
                                   nullptr),
             { cleanup(); return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."); });
    // Verify against the reference answer.
    try {
        allClose(out, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        cleanup();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    // Benchmark.
    double elapsed_time = benchmark(
        [=]() {
            infiniopBCEWithLogits(op_desc, workspace, workspace_size,
                                  out->data(), logits->data(), target->data(),
                                  w_data, pw_data, nullptr);
        },
        warm_ups, iterations);
    cleanup();
    return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
    // The only scalar attribute carried by this test case.
    return std::vector<std::string>{"reduction"};
}
std::vector<std::string> Test::tensor_names() {
    // All tensor slots this test understands, optional ones included.
    std::vector<std::string> names{"logits", "target", "weight", "pos_weight", "out", "ans"};
    return names;
}
std::vector<std::string> Test::output_names() {
    // Only one output tensor is produced.
    return std::vector<std::string>{"out"};
}
/// Human-readable summary of the test configuration.
/// Fix: the original omitted `target` and, when present, `pos_weight`,
/// although it printed `logits` and `weight`; print all operands for parity.
std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;
    oss << "- reduction: " << _attributes->reduction << std::endl;
    oss << "- logits: " << _attributes->logits->info() << std::endl;
    oss << "- target: " << _attributes->target->info() << std::endl;
    if (_attributes->weight) {
        oss << "- weight: " << _attributes->weight->info() << std::endl;
    }
    if (_attributes->pos_weight) {
        oss << "- pos_weight: " << _attributes->pos_weight->info() << std::endl;
    }
    oss << "- out: " << _attributes->out->info() << std::endl;
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return oss.str();
}
// _attributes is a raw owning pointer allocated in build(); release it here.
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::binary_cross_entropy_with_logits
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::cdist {
// Tensors and scalar attributes of one cdist test case.
struct Test::Attributes {
std::shared_ptr<Tensor> x1;
std::shared_ptr<Tensor> x2;
std::shared_ptr<Tensor> out;
std::shared_ptr<Tensor> ans;
double p; // norm order of the pairwise distance (2.0 = Euclidean default)
};
/// Builds a cdist test case from raw attribute bytes and tensors.
/// Throws std::runtime_error when mandatory tensors are missing or the `p`
/// attribute payload is malformed.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    // 1. Validate tensor presence (x1, x2, out, ans are all mandatory).
    if (tensors.find("x1") == tensors.end() || tensors.find("x2") == tensors.end() || tensors.find("out") == tensors.end() || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid Cdist Test: Missing tensors");
    }
    // 2. Read the scalar attribute `p` (stored as a double).
    //    Validate the payload size before reinterpreting the raw bytes —
    //    the previous unchecked cast read out of bounds on a short payload.
    test->_attributes->p = 2.0; // default: Euclidean distance
    auto p_it = attributes.find("p");
    if (p_it != attributes.end()) {
        if (p_it->second.size() < sizeof(double)) {
            throw std::runtime_error("Invalid Cdist Test: malformed p attribute");
        }
        test->_attributes->p = *reinterpret_cast<const double *>(p_it->second.data());
    }
    test->_attributes->x1 = tensors["x1"];
    test->_attributes->x2 = tensors["x2"];
    test->_attributes->out = tensors["out"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
/// Runs the cdist operator on the given device, verifies the M x N distance
/// matrix against the reference, and benchmarks it.
/// Fix over the original: the workspace and the descriptor are now released
/// on every exit path; previously all failure paths after creation leaked them.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopCdistDescriptor_t op_desc;
    // Move operands onto the target device (x1: M x D, x2: N x D).
    auto x1 = _attributes->x1->to(device, device_id);
    auto x2 = _attributes->x2->to(device, device_id);
    auto out = _attributes->out->to(device, device_id);
    // Create the operator descriptor.
    CHECK_OR(infiniopCreateCdistDescriptor(handle, &op_desc,
                                           out->desc(),
                                           x1->desc(),
                                           x2->desc(),
                                           _attributes->p),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cdist descriptor."));
    // Workspace management: from here on the descriptor must be destroyed on failure.
    size_t workspace_size;
    CHECK_OR(infiniopGetCdistWorkspaceSize(op_desc, &workspace_size),
             { infiniopDestroyCdistDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."); });
    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             { infiniopDestroyCdistDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."); });
    // Both resources must now be released on every exit path.
    auto cleanup = [&]() {
        infinirtFree(workspace);
        infiniopDestroyCdistDescriptor(op_desc);
    };
    // Execute once for correctness checking.
    CHECK_OR(infiniopCdist(op_desc, workspace, workspace_size,
                           out->data(),
                           x1->data(),
                           x2->data(),
                           nullptr), // stream
             { cleanup(); return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."); });
    // Verify against the reference answer.
    try {
        allClose(out, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        cleanup();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    // Benchmark.
    double elapsed_time = benchmark(
        [=]() {
            infiniopCdist(op_desc, workspace, workspace_size,
                          out->data(),
                          x1->data(),
                          x2->data(),
                          nullptr);
        },
        warm_ups, iterations);
    cleanup();
    return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
    // Only the norm order `p` is carried as a scalar attribute.
    return std::vector<std::string>{"p"};
}
std::vector<std::string> Test::tensor_names() {
    // All tensor slots this test expects.
    std::vector<std::string> names{"x1", "x2", "out", "ans"};
    return names;
}
std::vector<std::string> Test::output_names() {
    // Only one output tensor is produced.
    return std::vector<std::string>{"out"};
}
// Human-readable summary of the test configuration.
std::string Test::toString() const {
    std::ostringstream text;
    text << op_name() << std::endl
         << "- p: " << _attributes->p << std::endl
         << "- x1: " << _attributes->x1->info() << std::endl
         << "- x2: " << _attributes->x2->info() << std::endl
         << "- out: " << _attributes->out->info() << std::endl;
    // Tolerances are shown in scientific notation with two decimals.
    text << std::scientific << std::setprecision(2)
         << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return text.str();
}
// _attributes is a raw owning pointer allocated in build(); release it here.
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::cdist
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::reciprocal {
// Tensors of one reciprocal test case; `ans` is the reference output
// that `y` is compared against after execution.
struct Test::Attributes {
std::shared_ptr<Tensor> x;
std::shared_ptr<Tensor> y;
std::shared_ptr<Tensor> ans;
};
/// Builds a reciprocal test case; all three tensors are mandatory.
/// Throws std::runtime_error when any of them is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    // Every slot is mandatory for this unary op test.
    for (const char *required : {"x", "y", "ans"}) {
        if (tensors.find(required) == tensors.end()) {
            throw std::runtime_error("Invalid Test");
        }
    }
    test->_attributes->x = tensors["x"];
    test->_attributes->y = tensors["y"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
/// Runs the reciprocal operator, verifies y against the reference, benchmarks.
/// Fix over the original: the workspace was NEVER freed and the descriptor
/// NEVER destroyed — not even on success; both are now released on every
/// exit path after they are acquired.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopReciprocalDescriptor_t op_desc;
    auto x = _attributes->x->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    CHECK_OR(infiniopCreateReciprocalDescriptor(handle, &op_desc,
                                                y->desc(),
                                                x->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetReciprocalWorkspaceSize(op_desc, &workspace_size),
             { infiniopDestroyReciprocalDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."); });
    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             { infiniopDestroyReciprocalDescriptor(op_desc); return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."); });
    // Both resources must now be released on every exit path.
    auto cleanup = [&]() {
        infinirtFree(workspace);
        infiniopDestroyReciprocalDescriptor(op_desc);
    };
    CHECK_OR(infiniopReciprocal(op_desc, workspace, workspace_size,
                                y->data(),
                                x->data(),
                                nullptr),
             { cleanup(); return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."); });
    // Verify against the reference answer.
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        cleanup();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    // Benchmark.
    double elapsed_time = benchmark(
        [=]() {
            infiniopReciprocal(
                op_desc, workspace, workspace_size,
                y->data(),
                x->data(),
                nullptr);
        },
        warm_ups, iterations);
    cleanup();
    return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
    // Reciprocal carries no scalar attributes.
    return std::vector<std::string>{};
}
std::vector<std::string> Test::tensor_names() {
    // Input, output, and reference answer.
    std::vector<std::string> names{"x", "y", "ans"};
    return names;
}
std::vector<std::string> Test::output_names() {
    // Only one output tensor is produced.
    return std::vector<std::string>{"y"};
}
// Human-readable summary of the test configuration.
std::string Test::toString() const {
    std::ostringstream text;
    text << op_name() << std::endl
         << "- x: " << _attributes->x->info() << std::endl
         << "- y: " << _attributes->y->info() << std::endl;
    // Tolerances are shown in scientific notation with two decimals.
    text << std::scientific << std::setprecision(2)
         << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return text.str();
}
// _attributes is a raw owning pointer allocated in build(); release it here.
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::reciprocal
#include "addcmul_cpu.h"
namespace op::addcmul::cpu {
// Defaulted: members (unique_ptr, ElementwiseInfo) clean themselves up via RAII.
Descriptor::~Descriptor() = default;
/// Creates the CPU addcmul descriptor: validates dtype and shapes, builds the
/// generic elementwise descriptor, then stores the extra scalar `value`.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float value) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // Only floating-point dtypes are supported.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    // Every input must match the output shape (no broadcasting here).
    const auto &out_shape = out_desc->shape();
    for (const auto &desc : input_desc_vec) {
        CHECK_SAME_SHAPE(out_shape, desc->shape());
    }
    // Instantiates the Descriptor and assigns it to *desc_ptr.
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    // Stash the scalar coefficient on the freshly created descriptor.
    (*desc_ptr)->_value = value;
    return INFINI_STATUS_SUCCESS;
}
/// Dispatches the elementwise addcmul computation by output dtype.
/// Fix: removed the unreachable `return INFINI_STATUS_SUCCESS;` that followed
/// the switch — every case (including default) already returns.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // AddcmulOp (defined in addcmul_cpu.h) is instantiated per dtype; the
    // scalar _value is forwarded as the trailing argument.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<AddcmulOp, fp16_t>(_info, output, inputs, stream, _value);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<AddcmulOp, float>(_info, output, inputs, stream, _value);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<AddcmulOp, double>(_info, output, inputs, stream, _value);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<AddcmulOp, bf16_t>(_info, output, inputs, stream, _value);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::cpu
#ifndef __ADDCMUL_CPU_H__
#define __ADDCMUL_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <cmath>
#include <type_traits>
namespace op::addcmul::cpu {
struct AddcmulOp {
public:
    // Ternary elementwise op: out = input + value * t1 * t2.
    static constexpr size_t num_inputs = 3;
    template <typename T, typename Scalar>
    T operator()(const T &input, const T &t1, const T &t2, Scalar value) const {
        if constexpr (std::is_floating_point_v<T>) {
            // Native float/double: compute directly in T.
            return input + static_cast<T>(value) * t1 * t2;
        } else {
            // fp16/bf16 and similar: widen to float for the multiply-add to
            // keep precision, then narrow the result back to T.
            const float acc = static_cast<float>(input)
                            + static_cast<float>(value) * static_cast<float>(t1) * static_cast<float>(t2);
            return static_cast<T>(acc);
        }
    }
};
// CPU-side Descriptor for addcmul; extends the generic elementwise descriptor
// pattern with one extra scalar parameter (`value`).
class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
std::unique_ptr<op::elementwise::cpu::DeviceImpl> _device_info;
size_t _workspace_size;
float _value; // scalar coefficient in out = input + value * t1 * t2
// Private: instances are created through the static create() factory.
Descriptor(
infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::cpu::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_device_info(device_info),
_workspace_size(workspace_size),
_value(0.0f) {} // value is patched in by create() after construction
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
// Factory; takes the extra scalar `value` in addition to tensor descriptors.
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
std::vector<infiniopTensorDescriptor_t> input_descs,
float value);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
float getValue() const { return _value; }
};
} // namespace op::addcmul::cpu
#endif // __ADDCMUL_CPU_H__
#ifndef __ADDCMUL_CUDA_CUH__
#define __ADDCMUL_CUDA_CUH__
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <type_traits>
namespace op::addcmul::cuda {
struct AddcmulOp {
public:
    // Ternary elementwise op: out = input + value * t1 * t2.
    static constexpr size_t num_inputs = 3;
    template <typename T>
    __device__ __host__ __forceinline__ T operator()(const T &input, const T &t1, const T &t2, float value) const {
        if constexpr (std::is_same_v<T, double>) {
            return input + static_cast<double>(value) * t1 * t2;
        } else if constexpr (std::is_same_v<T, float>) {
            return input + value * t1 * t2;
        } else if constexpr (std::is_same_v<T, half>) {
            // Widen to float to preserve precision and simplify the scalar multiply.
            const float acc = __half2float(input) + value * __half2float(t1) * __half2float(t2);
            return __float2half(acc);
        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
            const float acc = __bfloat162float(input) + value * __bfloat162float(t1) * __bfloat162float(t2);
            return __float2bfloat16(acc);
        } else {
            // Fallback for any other element type: compute in float, cast back.
            return static_cast<T>(static_cast<float>(input) + value * static_cast<float>(t1) * static_cast<float>(t2));
        }
    }
};
} // namespace op::addcmul::cuda
#endif // __ADDCMUL_CUDA_CUH__
#ifndef __ADDCMUL_METAX_H__
#define __ADDCMUL_METAX_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
namespace op::addcmul::metax {
// METAX-side Descriptor for addcmul; supports the extra scalar parameter `value`.
class Descriptor final : public InfiniopDescriptor {
// Kept for compatibility with the generic Elementwise framework.
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
size_t _workspace_size;
float _value; // scalar coefficient in out = input + value * t1 * t2
public:
// Tensor metadata recorded for the custom device kernel (strided access).
static constexpr int MAX_NDIM = 8;
struct TensorMeta {
int ndim;
size_t shape[MAX_NDIM];
ptrdiff_t strides[MAX_NDIM];
};
TensorMeta _out_meta{};
TensorMeta _input_meta{};
TensorMeta _t1_meta{};
TensorMeta _t2_meta{};
size_t _output_size{0}; // total number of output elements
Descriptor(
infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::metax::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_device_info(device_info),
_workspace_size(workspace_size),
_value(0.0f) {} // value is patched in by create() after construction
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
// Factory; takes the extra scalar `value` in addition to tensor descriptors.
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
std::vector<infiniopTensorDescriptor_t> input_descs,
float value);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
float getValue() const { return _value; }
};
} // namespace op::addcmul::metax
#endif // __ADDCMUL_METAX_H__
#include "../../../elementwise/metax/elementwise_metax.h"
#include "addcmul_metax.h"
#include "addcmul_metax_kernel.h"
namespace op::addcmul::metax {
// Defaulted: members (unique_ptr, ElementwiseInfo) clean themselves up via RAII.
Descriptor::~Descriptor() = default;
// Copies a TensorDescriptor's shape/strides into the fixed-size TensorMeta
// used by the custom kernel; fails for ranks above MAX_NDIM.
static inline infiniStatus_t fill_tensor_meta(
    infiniopTensorDescriptor_t desc,
    Descriptor::TensorMeta &meta) {
    const auto rank = desc->ndim();
    if (rank > Descriptor::MAX_NDIM) {
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
    meta.ndim = static_cast<int>(rank);
    const auto &dims = desc->shape();
    const auto &strs = desc->strides();
    for (int d = 0; d < meta.ndim; ++d) {
        meta.shape[d] = dims[d];
        meta.strides[d] = strs[d];
    }
    return INFINI_STATUS_SUCCESS;
}
// Creates the METAX addcmul descriptor: validates dtype/shapes, builds the
// generic elementwise descriptor, records tensor metadata for the custom
// kernel, and stores the scalar `value`.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float value) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
// 1. dtype check: floating-point types only.
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
// 2. Shape check: output and all three inputs must match (no broadcasting).
const auto &out_shape = out_desc->shape();
const auto &input_desc = input_desc_vec.at(0);
const auto &t1_desc = input_desc_vec.at(1);
const auto &t2_desc = input_desc_vec.at(2);
CHECK_SAME_SHAPE(out_shape, input_desc->shape());
CHECK_SAME_SHAPE(out_shape, t1_desc->shape());
CHECK_SAME_SHAPE(out_shape, t2_desc->shape());
// 3. Create the underlying elementwise METAX descriptor (assigns *desc_ptr).
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
// 4. Record tensor metadata and output element count for the custom kernel.
auto *desc = *desc_ptr;
desc->_output_size = out_desc->numel();
CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta));
CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta));
CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta));
CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta));
// 5. Stash the scalar coefficient on the descriptor.
desc->_value = value;
return INFINI_STATUS_SUCCESS;
}
// Custom addcmul METAX kernel: uses the Descriptor's TensorMeta for generic
// strided access, so non-contiguous tensors are handled correctly.
// One thread computes one output element.
template <typename T>
INFINIOP_METAX_KERNEL addcmul_kernel(
size_t output_size,
Descriptor::TensorMeta out_meta,
Descriptor::TensorMeta in_meta,
Descriptor::TensorMeta t1_meta,
Descriptor::TensorMeta t2_meta,
T *out,
const T *input,
const T *t1,
const T *t2,
float value) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
// Threads beyond the element count exit early.
if (idx >= output_size) {
return;
}
// Decompose the linear index into per-dimension coordinates (last dimension
// fastest) and accumulate each tensor's strided offset.
ptrdiff_t out_offset = 0;
ptrdiff_t in_offset = 0;
ptrdiff_t t1_offset = 0;
ptrdiff_t t2_offset = 0;
size_t linear = idx;
for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
size_t dim_size = out_meta.shape[dim];
size_t coord = linear % dim_size;
linear /= dim_size;
out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
}
T in_val = input[in_offset];
T t1_val = t1[t1_offset];
T t2_val = t2[t2_offset];
// out = input + value * t1 * t2 (see AddcmulOp).
out[out_offset] = op::addcmul::metax::AddcmulOp{}(in_val, t1_val, t2_val, value);
}
// Launch wrapper: configures the grid, forwards the recorded tensor metadata
// to the strided kernel, and surfaces launch errors via CHECK_METAX.
template <typename T>
static inline infiniStatus_t launch_addcmul_kernel(
    const Descriptor *desc,
    void *output,
    const std::vector<const void *> &inputs,
    void *stream) {
    const size_t element_count = desc->_output_size;
    // Empty output: nothing to launch.
    if (element_count == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    constexpr uint32_t BLOCK_SIZE = 256;
    const uint32_t grid_dim = static_cast<uint32_t>((element_count + BLOCK_SIZE - 1) / BLOCK_SIZE);
    auto metax_stream = reinterpret_cast<mcStream_t>(stream);
    addcmul_kernel<T><<<grid_dim, BLOCK_SIZE, 0, metax_stream>>>(
        element_count,
        desc->_out_meta,
        desc->_input_meta,
        desc->_t1_meta,
        desc->_t2_meta,
        reinterpret_cast<T *>(output),
        reinterpret_cast<const T *>(inputs.at(0)),
        reinterpret_cast<const T *>(inputs.at(1)),
        reinterpret_cast<const T *>(inputs.at(2)),
        desc->getValue());
    CHECK_METAX(mcGetLastError());
    return INFINI_STATUS_SUCCESS;
}
// Dispatches the custom strided METAX kernel by output dtype.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // Workspace contents are unused; only the size contract is enforced to
    // stay consistent with the other operators' interface semantics.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Bypass the generic elementwise framework and call the kernel directly.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return launch_addcmul_kernel<half>(this, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return launch_addcmul_kernel<cuda_bfloat16>(this, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return launch_addcmul_kernel<float>(this, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return launch_addcmul_kernel<double>(this, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::metax
#ifndef __ADDCMUL_METAX_KERNEL_H__
#define __ADDCMUL_METAX_KERNEL_H__
/*
 * This file contains the Addcmul operation implementation for the METAX backend.
 * Formula: out = input + value * tensor1 * tensor2
 */
namespace op::addcmul::metax {
typedef struct AddcmulOp {
public:
// Ternary elementwise op: inputs are input, tensor1, tensor2.
static constexpr size_t num_inputs = 3;
template <typename T>
__device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const {
if constexpr (std::is_same_v<T, float>) {
// F32: compute directly; the compiler can fuse this into a multiply-add.
return in + value * t1 * t2;
} else if constexpr (std::is_same_v<T, half>) {
// F16: widen to float to avoid losing precision in the intermediate product.
float f_in = __half2float(in);
float f_t1 = __half2float(t1);
float f_t2 = __half2float(t2);
return __float2half(f_in + value * f_t1 * f_t2);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// BF16: likewise widen to float.
float f_in = __bfloat162float(in);
float f_t1 = __bfloat162float(t1);
float f_t2 = __bfloat162float(t2);
return __float2bfloat16_rn(f_in + value * f_t1 * f_t2);
} else if constexpr (std::is_same_v<T, double>) {
return in + (double)value * t1 * t2;
} else {
// Integer or other element types: cast the scalar into T.
return in + static_cast<T>(value) * t1 * t2;
}
}
} AddcmulOp;
} // namespace op::addcmul::metax
#endif // __ADDCMUL_METAX_KERNEL_H__
#ifndef __ADDCMUL_MOORE_H__
#define __ADDCMUL_MOORE_H__
// 1. 切换到 Moore 平台的 Elementwise API
#include "../../../elementwise/moore/elementwise_moore_api.h"
namespace op::addcmul::moore {
/**
 * Moore-side Descriptor for addcmul.
 * Mirrors the NVIDIA version's structure so the platforms stay aligned.
 */
class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
// Moore-device implementation handle (see elementwise_moore_api.h).
std::unique_ptr<op::elementwise::moore::DeviceImpl> _device_info;
size_t _workspace_size;
float _value; // scalar coefficient in out = input + value * t1 * t2
public:
// MUSA also supports strided access; tensor metadata for the custom kernel.
static constexpr int MAX_NDIM = 8;
struct TensorMeta {
int ndim;
size_t shape[MAX_NDIM];
ptrdiff_t strides[MAX_NDIM];
};
TensorMeta _out_meta{};
TensorMeta _input_meta{};
TensorMeta _t1_meta{};
TensorMeta _t2_meta{};
size_t _output_size{0}; // total number of output elements
Descriptor(
infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::moore::DeviceImpl *device_info, // Moore device impl
size_t workspace_size,
infiniDevice_t device_type,
int device_id)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_device_info(device_info),
_workspace_size(workspace_size),
_value(0.0f) {} // value is patched in by create() after construction
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
// Factory; same interface as the other backends, including the scalar `value`.
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
std::vector<infiniopTensorDescriptor_t> input_descs,
float value);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
float getValue() const { return _value; }
};
} // namespace op::addcmul::moore
#endif // __ADDCMUL_MOORE_H__
#include "../../../elementwise/moore/elementwise_moore.h"
#include "addcmul_moore.h"
#include "addcmul_moore_kernel.h"
#include <musa_runtime.h>
namespace op::addcmul::moore {
// Defaulted: members (unique_ptr, ElementwiseInfo) clean themselves up via RAII.
Descriptor::~Descriptor() = default;
// Copies a TensorDescriptor's shape/strides into the fixed-size TensorMeta
// used by the MUSA kernel's strided addressing (same logic as NVIDIA).
static inline infiniStatus_t fill_tensor_meta(
    infiniopTensorDescriptor_t desc,
    Descriptor::TensorMeta &meta) {
    const auto rank = desc->ndim();
    if (rank > Descriptor::MAX_NDIM) {
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
    meta.ndim = static_cast<int>(rank);
    const auto &dims = desc->shape();
    const auto &strs = desc->strides();
    for (int d = 0; d < meta.ndim; ++d) {
        meta.shape[d] = dims[d];
        meta.strides[d] = strs[d];
    }
    return INFINI_STATUS_SUCCESS;
}
// Creates the Moore addcmul descriptor: validates dtype/shapes, builds the
// generic elementwise descriptor, records tensor metadata for the custom
// MUSA kernel, and stores the scalar `value`.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float value) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();
// dtype check: floating-point types only.
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
// Shape check: output and all three inputs must match (no broadcasting).
const auto &out_shape = out_desc->shape();
const auto &input_desc = input_desc_vec.at(0);
const auto &t1_desc = input_desc_vec.at(1);
const auto &t2_desc = input_desc_vec.at(2);
CHECK_SAME_SHAPE(out_shape, input_desc->shape());
CHECK_SAME_SHAPE(out_shape, t1_desc->shape());
CHECK_SAME_SHAPE(out_shape, t2_desc->shape());
// Create the underlying elementwise Moore descriptor (assigns *desc_ptr).
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
auto *desc = *desc_ptr;
desc->_output_size = out_desc->numel();
// Record tensor metadata for the custom kernel.
CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta));
CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta));
CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta));
CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta));
desc->_value = value;
return INFINI_STATUS_SUCCESS;
}
// MUSA kernel: same strided-addressing logic as the other backends.
// One thread computes one output element.
template <typename T>
__global__ void addcmul_kernel(
size_t output_size,
Descriptor::TensorMeta out_meta,
Descriptor::TensorMeta in_meta,
Descriptor::TensorMeta t1_meta,
Descriptor::TensorMeta t2_meta,
T *out,
const T *input,
const T *t1,
const T *t2,
float value) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
// Threads beyond the element count exit early.
if (idx >= output_size) {
return;
}
ptrdiff_t out_offset = 0, in_offset = 0, t1_offset = 0, t2_offset = 0;
size_t linear = idx;
// Generic multi-dimensional index -> strided offset conversion
// (last dimension varies fastest).
for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
size_t dim_size = out_meta.shape[dim];
size_t coord = linear % dim_size;
linear /= dim_size;
out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
}
// Delegate the arithmetic to the Moore AddcmulOp functor.
out[out_offset] = op::addcmul::moore::AddcmulOp{}(input[in_offset], t1[t1_offset], t2[t2_offset], value);
}
// Launch wrapper for the MUSA kernel.
// Fix: the original returned success unconditionally after the launch; the
// METAX twin checks mcGetLastError() here. Surface launch-configuration
// errors instead of silently dropping them.
template <typename T>
static inline infiniStatus_t launch_addcmul_kernel(
    const Descriptor *desc,
    void *output,
    const std::vector<const void *> &inputs,
    void *stream) {
    size_t output_size = desc->_output_size;
    // Empty output: nothing to launch.
    if (output_size == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    auto *out_ptr = reinterpret_cast<T *>(output);
    auto *in_ptr = reinterpret_cast<const T *>(inputs.at(0));
    auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
    auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));
    musaStream_t musa_stream = reinterpret_cast<musaStream_t>(stream);
    constexpr uint32_t BLOCK_SIZE = 256;
    uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
    addcmul_kernel<T><<<grid, BLOCK_SIZE, 0, musa_stream>>>(
        output_size, desc->_out_meta, desc->_input_meta, desc->_t1_meta, desc->_t2_meta,
        out_ptr, in_ptr, t1_ptr, t2_ptr, desc->getValue());
    // Asynchronous launch: this only catches launch/configuration errors,
    // not failures that occur later during kernel execution.
    if (musaGetLastError() != musaSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}
// Dispatches the custom strided MUSA kernel by output dtype.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // Workspace contents are unused; only the size contract is enforced.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return launch_addcmul_kernel<half>(this, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        // The Moore toolchain exposes bf16 under the cuda_bfloat16 name.
        return launch_addcmul_kernel<cuda_bfloat16>(this, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return launch_addcmul_kernel<float>(this, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return launch_addcmul_kernel<double>(this, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::moore
#ifndef __ADDCMUL_MOORE_KERNEL_H__
#define __ADDCMUL_MOORE_KERNEL_H__
/*
* This file contains the Addcmul operation implementation for the MUSA backend.
* Formula: out = input + value * tensor1 * tensor2
*/
namespace op::addcmul::moore {
typedef struct AddcmulOp {
public:
// Ternary elementwise op: inputs are input, tensor1, tensor2.
static constexpr size_t num_inputs = 3;
template <typename T>
__device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const {
if constexpr (std::is_same_v<T, float>) {
// F32: compute directly; the compiler can fuse this into a multiply-add.
return in + value * t1 * t2;
} else if constexpr (std::is_same_v<T, half>) {
// F16: widen to float to avoid losing precision in the intermediate product.
float f_in = __half2float(in);
float f_t1 = __half2float(t1);
float f_t2 = __half2float(t2);
return __float2half(f_in + value * f_t1 * f_t2);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// BF16: likewise widen to float.
float f_in = __bfloat162float(in);
float f_t1 = __bfloat162float(t1);
float f_t2 = __bfloat162float(t2);
return __float2bfloat16_rn(f_in + value * f_t1 * f_t2);
} else if constexpr (std::is_same_v<T, double>) {
return in + (double)value * t1 * t2;
} else {
// Integer or other element types: cast the scalar into T.
return in + static_cast<T>(value) * t1 * t2;
}
}
} AddcmulOp;
} // namespace op::addcmul::moore
#endif // __ADDCMUL_MOORE_KERNEL_H__
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "addcmul_nvidia.cuh"
namespace op::addcmul::nvidia {
// Defaulted: members (unique_ptr, ElementwiseInfo) clean themselves up via RAII.
Descriptor::~Descriptor() = default;
// Copies a TensorDescriptor's shape/strides into the fixed-size TensorMeta
// consumed by the custom CUDA kernel; fails for ranks above MAX_NDIM.
static inline infiniStatus_t fill_tensor_meta(
    infiniopTensorDescriptor_t desc,
    Descriptor::TensorMeta &meta) {
    const auto rank = desc->ndim();
    if (rank > Descriptor::MAX_NDIM) {
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
    meta.ndim = static_cast<int>(rank);
    const auto &dims = desc->shape();
    const auto &strs = desc->strides();
    for (int d = 0; d < meta.ndim; ++d) {
        meta.shape[d] = dims[d];
        meta.strides[d] = strs[d];
    }
    return INFINI_STATUS_SUCCESS;
}
// Build an addcmul Descriptor for the NVIDIA backend.
// Validates dtype/shapes, creates the generic elementwise descriptor (which
// allocates *desc_ptr), then records the strided-kernel metadata and the
// scalar attribute `value` on the freshly created descriptor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float value) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // 1. Dtype check: only floating-point element types are supported.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    // 2. Shape check: output must match all three inputs (no broadcasting).
    const auto &out_shape = out_desc->shape();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &t1_desc = input_desc_vec.at(1);
    const auto &t2_desc = input_desc_vec.at(2);
    CHECK_SAME_SHAPE(out_shape, input_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t1_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t2_desc->shape());
    // 3. Create the underlying elementwise CUDA descriptor.
    //    NOTE(review): this macro appears to allocate and assign *desc_ptr —
    //    the assignments below rely on that.
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    // 4. Record tensor metadata and output element count for the custom kernel.
    auto *desc = *desc_ptr;
    desc->_output_size = out_desc->numel();
    CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta));
    CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta));
    CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta));
    CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta));
    // 5. Store the scalar attribute `value` inside the Descriptor.
    desc->_value = value;
    return INFINI_STATUS_SUCCESS;
}
// 自定义 addcmul CUDA kernel:使用 Descriptor 中的 TensorMeta 做通用 strided 访问
// Custom addcmul CUDA kernel: uses the TensorMeta stored in the Descriptor to
// perform generic strided accesses, so non-contiguous layouts are supported.
// One thread computes one output element, identified by its linear index over
// the output shape.
template <typename T>
INFINIOP_CUDA_KERNEL addcmul_kernel(
    size_t output_size,
    Descriptor::TensorMeta out_meta,
    Descriptor::TensorMeta in_meta,
    Descriptor::TensorMeta t1_meta,
    Descriptor::TensorMeta t2_meta,
    T *out,
    const T *input,
    const T *t1,
    const T *t2,
    float value) {
    // FIX: widen blockIdx.x to size_t BEFORE the multiply. The original
    // `blockIdx.x * blockDim.x` was evaluated in 32-bit unsigned arithmetic
    // and would wrap for tensors with more than 2^32 elements.
    size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= output_size) {
        return;
    }
    // Decompose the linear index into per-dimension coordinates (innermost
    // dimension first) and accumulate each tensor's offset via its strides.
    // A 0-d (scalar) tensor skips the loop and uses offset 0.
    ptrdiff_t out_offset = 0;
    ptrdiff_t in_offset = 0;
    ptrdiff_t t1_offset = 0;
    ptrdiff_t t2_offset = 0;
    size_t linear = idx;
    for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
        size_t dim_size = out_meta.shape[dim];
        size_t coord = linear % dim_size;
        linear /= dim_size;
        out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
        in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
        t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
        t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
    }
    // Strides are used as direct pointer indices here, so they are presumably
    // element strides, not byte strides — TODO confirm against TensorDescriptor.
    T in_val = input[in_offset];
    T t1_val = t1[t1_offset];
    T t2_val = t2[t2_offset];
    out[out_offset] = op::addcmul::cuda::AddcmulOp{}(in_val, t1_val, t2_val, value);
}
// Host-side launcher: casts the type-erased buffers to T, derives the launch
// configuration from the output element count, and dispatches addcmul_kernel<T>
// on the given stream. Returns immediately for empty outputs.
template <typename T>
static inline infiniStatus_t launch_addcmul_kernel(
    const Descriptor *desc,
    void *output,
    const std::vector<const void *> &inputs,
    void *stream) {
    const size_t total = desc->_output_size;
    // Empty tensor: nothing to launch.
    if (total == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    auto *dst = reinterpret_cast<T *>(output);
    auto *src_in = reinterpret_cast<const T *>(inputs.at(0));
    auto *src_t1 = reinterpret_cast<const T *>(inputs.at(1));
    auto *src_t2 = reinterpret_cast<const T *>(inputs.at(2));
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    constexpr uint32_t BLOCK_SIZE = 256;
    const uint32_t grid = static_cast<uint32_t>((total + BLOCK_SIZE - 1) / BLOCK_SIZE);
    addcmul_kernel<T><<<grid, BLOCK_SIZE, 0, cuda_stream>>>(
        total,
        desc->_out_meta,
        desc->_input_meta,
        desc->_t1_meta,
        desc->_t2_meta,
        dst,
        src_in,
        src_t1,
        src_t2,
        desc->getValue());
    // Surface any launch-configuration error right away.
    CHECK_CUDA(cudaGetLastError());
    return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // The workspace contents are unused; the size check only keeps the
    // interface semantics consistent with the other operators.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Dispatch straight to the custom CUDA kernel (bypassing the generic
    // elementwise framework) on the element type recorded at create time.
    switch (_dtype) {
    case INFINI_DTYPE_F32:
        return launch_addcmul_kernel<float>(this, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return launch_addcmul_kernel<double>(this, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return launch_addcmul_kernel<half>(this, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return launch_addcmul_kernel<nv_bfloat16>(this, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::nvidia
#ifndef __ADDCMUL_NVIDIA_H__
#define __ADDCMUL_NVIDIA_H__

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

namespace op::addcmul::nvidia {

// Descriptor for addcmul on the NVIDIA backend.
// Hand-written (instead of the generic ELEMENTWISE_DESCRIPTOR macro) because
// the operator carries an extra scalar attribute `value`
// (out = input + value * t1 * t2) and the launch path uses a custom strided
// CUDA kernel that needs per-tensor shape/stride metadata.
class Descriptor final : public InfiniopDescriptor {
    // Members retained for compatibility with the generic elementwise framework.
    infiniDtype_t _dtype;
    op::elementwise::ElementwiseInfo _info;
    std::unique_ptr<op::elementwise::nvidia::DeviceImpl> _device_info;
    size_t _workspace_size;
    float _value; // scalar coefficient `value`

    // NOTE: the original had two separate `public:` sections; the redundant
    // second specifier has been removed and all public members merged here.
public:
    // Fixed-size tensor metadata snapshot consumed by the custom CUDA kernel.
    static constexpr int MAX_NDIM = 8;
    struct TensorMeta {
        int ndim; // number of valid dimensions (<= MAX_NDIM)
        size_t shape[MAX_NDIM];
        ptrdiff_t strides[MAX_NDIM]; // presumably element strides — the kernel indexes pointers directly; confirm
    };
    TensorMeta _out_meta{};
    TensorMeta _input_meta{};
    TensorMeta _t1_meta{};
    TensorMeta _t2_meta{};
    size_t _output_size{0}; // total number of output elements

    // Takes ownership of `device_info`. `_value` starts at 0.0f and is filled
    // in by create() after construction.
    Descriptor(
        infiniDtype_t dtype,
        op::elementwise::ElementwiseInfo info,
        op::elementwise::nvidia::DeviceImpl *device_info,
        size_t workspace_size,
        infiniDevice_t device_type,
        int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _dtype(dtype),
          _info(std::move(info)),
          _device_info(device_info),
          _workspace_size(workspace_size),
          _value(0.0f) {}

    ~Descriptor();

    size_t workspaceSize() const { return _workspace_size; }

    // Same shape as the generic descriptors' factory, plus the extra scalar
    // `value` attribute.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t output_desc,
        std::vector<infiniopTensorDescriptor_t> input_descs,
        float value);

    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;

    float getValue() const { return _value; }
};

} // namespace op::addcmul::nvidia

#endif // __ADDCMUL_NVIDIA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/addcmul.h"
#ifdef ENABLE_CPU_API
#include "cpu/addcmul_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/addcmul_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/addcmul_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/addcmul_kunlun.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/addcmul_bang.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/addcmul_moore.h"
#endif
// Public C API: create an addcmul descriptor for the device bound to `handle`.
// Forwards to the backend-specific Descriptor::create, packing the three input
// descriptors into a vector and passing the scalar attribute `value` through.
__INFINI_C infiniStatus_t infiniopCreateAddcmulDescriptor(
    infiniopHandle_t handle,
    infiniopAddcmulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t t1_desc,
    infiniopTensorDescriptor_t t2_desc,
    float value) {

#define CREATE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        return op::addcmul::NAMESPACE::Descriptor::create(                  \
            handle,                                                         \
            reinterpret_cast<op::addcmul::NAMESPACE::Descriptor **>(desc_ptr), \
            out_desc,                                                       \
            {input_desc, t1_desc, t2_desc},                                 \
            value)

    // Dispatch on the handle's device type; each backend is compiled in only
    // when its ENABLE_* flag is set. ILUVATAR and QY reuse the NVIDIA code path.
    switch (handle->device) {

#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Public C API: query the workspace size (in bytes, per the backend's
// workspaceSize()) required by a previously created addcmul descriptor.
__INFINI_C infiniStatus_t infiniopGetAddcmulWorkspaceSize(infiniopAddcmulDescriptor_t desc, size_t *size) {

#define GET(CASE, NAMESPACE)                                                              \
    case CASE:                                                                            \
        *size = reinterpret_cast<op::addcmul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    // Dispatch on the device type recorded in the descriptor itself.
    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// Public C API: execute addcmul (out = input + value * t1 * t2; `value` was
// baked into the descriptor at creation). Forwards to the backend-specific
// Descriptor::calculate with the three input buffers packed into a vector.
__INFINI_C infiniStatus_t infiniopAddcmul(
    infiniopAddcmulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *input,
    const void *t1,
    const void *t2,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                            \
    case CASE:                                                                \
        return reinterpret_cast<const op::addcmul::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, out, {input, t1, t2}, stream)

    // Dispatch on the device type recorded in the descriptor.
    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// Public C API: destroy an addcmul descriptor, deleting it through its
// concrete backend type so the correct destructor runs.
__INFINI_C infiniStatus_t infiniopDestroyAddcmulDescriptor(infiniopAddcmulDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                               \
    case CASE:                                                                \
        delete reinterpret_cast<const op::addcmul::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    // Dispatch on the device type recorded in the descriptor.
    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
#include "atanh_cpu.h"
namespace op::atanh::cpu {
Descriptor::~Descriptor() = default;

// Build an atanh Descriptor for the CPU backend: validates the element type
// and input/output shape agreement, then delegates descriptor construction
// (including *desc_ptr allocation) to the shared elementwise macro.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &a_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    // Unary op: output must be floating point and match the input's shape.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, a_shape);
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Run atanh element-wise on the CPU. Dispatches on the element type recorded
// at create time; the AtanhOp functor (defined in atanh_cpu.h) supplies the
// per-element math. `workspace`/`workspace_size` are unused by this backend.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<AtanhOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<AtanhOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<AtanhOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<AtanhOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch branch returns, so the previously present trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable dead code and is removed.
}
} // namespace op::atanh::cpu
#ifndef __ATANH_CPU_H__
#define __ATANH_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <cmath>
#include <type_traits>
// 注册 atanh 算子在 cpu 后端的 descriptor
ELEMENTWISE_DESCRIPTOR(atanh, cpu)
namespace op::atanh::cpu {
typedef struct AtanhOp {
public:
// atanh 是一元算子
static constexpr size_t num_inputs = 1;
template <typename T>
T operator()(const T &a) const {
// 对于 float, double 等原生支持的类型直接调用 std::atanh
if constexpr (std::is_floating_point_v<T>) {
return std::atanh(a);
} else {
// 对于 half, bfloat16 等自定义类型,先转为 float 计算再转回
// 假设这些类型支持 static_cast 到 float
return static_cast<T>(std::atanhf(static_cast<float>(a)));
}
}
} AtanhOp;
} // namespace op::atanh::cpu
#endif // __ATANH_CPU_H__
#ifndef __ATANH_CUDA_H__
#define __ATANH_CUDA_H__

#include <cuda_bf16.h>
#include <cuda_fp16.h>

#include <cstddef>     // size_t
#include <type_traits> // std::is_same_v — do not rely on transitive includes

namespace op::atanh::cuda {

// Per-element atanh functor for the CUDA elementwise framework.
struct AtanhOp {
public:
    // atanh is a unary operator: exactly one input.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &a) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed half2: apply atanh to both lanes in float precision.
            float2 f = __half22float2(a);
            f.x = atanhf(f.x);
            f.y = atanhf(f.y);
            return __float22half2_rn(f);
        } else if constexpr (std::is_same_v<T, half>) {
            // half: compute in float, convert back.
            return __float2half(atanhf(__half2float(a)));
        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
            // bfloat16: same float round-trip.
            return __float2bfloat16(atanhf(__bfloat162float(a)));
        } else if constexpr (std::is_same_v<T, float>) {
            // float: call the device math library directly.
            return atanhf(a);
        } else if constexpr (std::is_same_v<T, double>) {
            return ::atanh(a);
        } else {
            // Integer/unsupported types should not reach here; simple fallback
            // through float.
            return static_cast<T>(atanhf(static_cast<float>(a)));
        }
    }
};

} // namespace op::atanh::cuda

#endif // __ATANH_CUDA_H__
#ifndef __ATANH_METAX_API_H__
#define __ATANH_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares op::atanh::metax::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(atanh, metax)

#endif // __ATANH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax.h"
#include "atanh_metax.h"
#include "atanh_metax_kernel.h"
namespace op::atanh::metax {
Descriptor::~Descriptor() = default;

// Build an atanh Descriptor for the Metax backend: validates the element type
// and input/output shape agreement, then delegates descriptor construction
// (including *desc_ptr allocation) to the shared elementwise macro.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &a_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    // Unary op: output must be floating point and match the input's shape.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(y_shape, a_shape);
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Run atanh element-wise on the Metax device. Verifies the caller supplied at
// least the workspace size queried at create time, then dispatches on the
// element type with a fixed 256-thread block; AtanhOp supplies the math.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, metax::AtanhOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, metax::AtanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, metax::AtanhOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, metax::AtanhOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch branch returns, so the previously present trailing
    // `return INFINI_STATUS_SUCCESS;` was unreachable dead code and is removed.
}
} // namespace op::atanh::metax
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment