Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
21c6af2d
Unverified
Commit
21c6af2d
authored
Mar 11, 2026
by
thatPepe
Committed by
GitHub
Mar 11, 2026
Browse files
Merge pull request #1069 from InfiniTensor/issue/1031_T1_1_15
【算子比赛2025秋】T1-1-15
parents
99a802dd
5f329d7a
Changes
112
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1784 additions
and
0 deletions
+1784
-0
src/infiniop-test/src/ops/binary_cross_entropy_with_logits.cpp
...nfiniop-test/src/ops/binary_cross_entropy_with_logits.cpp
+153
-0
src/infiniop-test/src/ops/cdist.cpp
src/infiniop-test/src/ops/cdist.cpp
+132
-0
src/infiniop-test/src/ops/reciprocal.cpp
src/infiniop-test/src/ops/reciprocal.cpp
+104
-0
src/infiniop/ops/addcmul/cpu/addcmul_cpu.cc
src/infiniop/ops/addcmul/cpu/addcmul_cpu.cc
+61
-0
src/infiniop/ops/addcmul/cpu/addcmul_cpu.h
src/infiniop/ops/addcmul/cpu/addcmul_cpu.h
+78
-0
src/infiniop/ops/addcmul/cuda/kernel.cuh
src/infiniop/ops/addcmul/cuda/kernel.cuh
+46
-0
src/infiniop/ops/addcmul/metax/addcmul_metax.h
src/infiniop/ops/addcmul/metax/addcmul_metax.h
+72
-0
src/infiniop/ops/addcmul/metax/addcmul_metax.maca
src/infiniop/ops/addcmul/metax/addcmul_metax.maca
+179
-0
src/infiniop/ops/addcmul/metax/addcmul_metax_kernel.h
src/infiniop/ops/addcmul/metax/addcmul_metax_kernel.h
+44
-0
src/infiniop/ops/addcmul/moore/addcmul_moore.h
src/infiniop/ops/addcmul/moore/addcmul_moore.h
+76
-0
src/infiniop/ops/addcmul/moore/addcmul_moore.mu
src/infiniop/ops/addcmul/moore/addcmul_moore.mu
+163
-0
src/infiniop/ops/addcmul/moore/addcmul_moore_kernel.h
src/infiniop/ops/addcmul/moore/addcmul_moore_kernel.h
+44
-0
src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cu
src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cu
+178
-0
src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cuh
src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cuh
+72
-0
src/infiniop/ops/addcmul/operator.cc
src/infiniop/ops/addcmul/operator.cc
+193
-0
src/infiniop/ops/atanh/cpu/atanh_cpu.cc
src/infiniop/ops/atanh/cpu/atanh_cpu.cc
+52
-0
src/infiniop/ops/atanh/cpu/atanh_cpu.h
src/infiniop/ops/atanh/cpu/atanh_cpu.h
+31
-0
src/infiniop/ops/atanh/cuda/kernel.cuh
src/infiniop/ops/atanh/cuda/kernel.cuh
+40
-0
src/infiniop/ops/atanh/metax/atanh_metax.h
src/infiniop/ops/atanh/metax/atanh_metax.h
+8
-0
src/infiniop/ops/atanh/metax/atanh_metax.maca
src/infiniop/ops/atanh/metax/atanh_metax.maca
+58
-0
No files found.
src/infiniop-test/src/ops/binary_cross_entropy_with_logits.cpp
0 → 100644
View file @
21c6af2d
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace
infiniop_test
::
binary_cross_entropy_with_logits
{
// Per-test tensor handles and scalar attributes for the
// binary_cross_entropy_with_logits operator test.
struct Test::Attributes {
    std::shared_ptr<Tensor> logits;
    std::shared_ptr<Tensor> target;
    std::shared_ptr<Tensor> weight;     // optional
    std::shared_ptr<Tensor> pos_weight; // optional
    std::shared_ptr<Tensor> out;
    std::shared_ptr<Tensor> ans;
    int reduction; // 0: none, 1: mean, 2: sum
};
// Construct a BCE-with-logits test case from raw attributes and tensors.
// Throws std::runtime_error when any mandatory tensor is absent.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol,
    double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    auto *attrs = test->_attributes;

    // All four mandatory tensors must be present.
    const bool has_required = tensors.count("logits") && tensors.count("target")
                           && tensors.count("out") && tensors.count("ans");
    if (!has_required) {
        throw std::runtime_error("Invalid BCE Test: Missing mandatory tensors");
    }

    // reduction defaults to 1 (mean) unless explicitly provided.
    attrs->reduction = 1;
    auto red_it = attributes.find("reduction");
    if (red_it != attributes.end()) {
        attrs->reduction = *reinterpret_cast<int *>(red_it->second.data());
    }

    attrs->logits = tensors["logits"];
    attrs->target = tensors["target"];
    attrs->out = tensors["out"];
    attrs->ans = tensors["ans"];
    // Optional tensors stay null when absent.
    attrs->weight = tensors.count("weight") ? tensors["weight"] : nullptr;
    attrs->pos_weight = tensors.count("pos_weight") ? tensors["pos_weight"] : nullptr;
    return test;
}
// Run the BCE-with-logits operator on the given device, validate the result
// against the reference answer, and benchmark it.
// Fix: the original leaked the workspace and the descriptor on the
// execution-failure and RESULT_INCORRECT paths (and leaked the descriptor when
// workspace-size query or allocation failed); every exit now releases both.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle,
    infiniDevice_t device,
    int device_id,
    size_t warm_ups,
    size_t iterations) {
    infiniopBCEWithLogitsDescriptor_t op_desc;
    // Move tensors to the target device.
    auto logits = _attributes->logits->to(device, device_id);
    auto target = _attributes->target->to(device, device_id);
    auto out = _attributes->out->to(device, device_id);
    // Optional tensors: only migrate when present.
    std::shared_ptr<Tensor> weight = (_attributes->weight) ? _attributes->weight->to(device, device_id) : nullptr;
    std::shared_ptr<Tensor> pos_weight = (_attributes->pos_weight) ? _attributes->pos_weight->to(device, device_id) : nullptr;
    // Descriptors of the optional tensors may legitimately be null.
    auto w_desc = weight ? weight->desc() : nullptr;
    auto pw_desc = pos_weight ? pos_weight->desc() : nullptr;
    CHECK_OR(infiniopCreateBCEWithLogitsDescriptor(
                 handle, &op_desc, out->desc(), logits->desc(), target->desc(),
                 w_desc, pw_desc,
                 static_cast<infiniopReduction_t>(_attributes->reduction)),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create BCE descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetBCEWithLogitsWorkspaceSize(op_desc, &workspace_size),
             infiniopDestroyBCEWithLogitsDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             infiniopDestroyBCEWithLogitsDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    // Release device resources on every exit path below.
    auto cleanup = [&]() {
        infinirtFree(workspace);
        infiniopDestroyBCEWithLogitsDescriptor(op_desc);
    };
    auto w_data = weight ? weight->data() : nullptr;
    auto pw_data = pos_weight ? pos_weight->data() : nullptr;
    CHECK_OR(infiniopBCEWithLogits(op_desc, workspace, workspace_size,
                                   out->data(), logits->data(), target->data(),
                                   w_data, pw_data, nullptr),
             cleanup();
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
    // Validate the computed output against the reference answer.
    try {
        allClose(out, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        cleanup();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    // Performance benchmark.
    double elapsed_time = benchmark(
        [=]() {
            infiniopBCEWithLogits(op_desc, workspace, workspace_size,
                                  out->data(), logits->data(), target->data(),
                                  w_data, pw_data, nullptr);
        },
        warm_ups, iterations);
    cleanup();
    return TEST_PASSED(elapsed_time);
}
// Names of scalar attributes consumed by this test.
std::vector<std::string> Test::attribute_names() {
    return {"reduction"};
}

// Names of tensors this test may receive (weight/pos_weight are optional).
std::vector<std::string> Test::tensor_names() {
    return {"logits", "target", "weight", "pos_weight", "out", "ans"};
}

// Tensors written by the operator under test.
std::vector<std::string> Test::output_names() {
    return {"out"};
}

// Human-readable summary of the test configuration.
std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << "\n";
    oss << "- reduction: " << _attributes->reduction << "\n";
    oss << "- logits: " << _attributes->logits->info() << "\n";
    if (_attributes->weight) {
        oss << "- weight: " << _attributes->weight->info() << "\n";
    }
    oss << "- out: " << _attributes->out->info() << "\n";
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << "\n";
    return oss.str();
}

Test::~Test() {
    delete _attributes;
}
}
// namespace infiniop_test::binary_cross_entropy_with_logits
src/infiniop-test/src/ops/cdist.cpp
0 → 100644
View file @
21c6af2d
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace
infiniop_test
::
cdist
{
// Per-test tensor handles and scalar attributes for the cdist operator test.
struct Test::Attributes {
    std::shared_ptr<Tensor> x1;
    std::shared_ptr<Tensor> x2;
    std::shared_ptr<Tensor> out;
    std::shared_ptr<Tensor> ans;
    double p; // norm order for the pairwise distance
};
std
::
shared_ptr
<
Test
>
Test
::
build
(
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
uint8_t
>>
attributes
,
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Tensor
>>
tensors
,
double
rtol
,
double
atol
)
{
auto
test
=
std
::
shared_ptr
<
Test
>
(
new
Test
(
rtol
,
atol
));
test
->
_attributes
=
new
Attributes
();
// 1. 校验张量是否存在 (x1, x2, out, ans)
if
(
tensors
.
find
(
"x1"
)
==
tensors
.
end
()
||
tensors
.
find
(
"x2"
)
==
tensors
.
end
()
||
tensors
.
find
(
"out"
)
==
tensors
.
end
()
||
tensors
.
find
(
"ans"
)
==
tensors
.
end
())
{
throw
std
::
runtime_error
(
"Invalid Cdist Test: Missing tensors"
);
}
// 2. 获取标量属性 p (注意 cdist 通常用 double)
test
->
_attributes
->
p
=
2.0
;
// 默认值
if
(
attributes
.
find
(
"p"
)
!=
attributes
.
end
())
{
test
->
_attributes
->
p
=
*
reinterpret_cast
<
double
*>
(
attributes
[
"p"
].
data
());
}
test
->
_attributes
->
x1
=
tensors
[
"x1"
];
test
->
_attributes
->
x2
=
tensors
[
"x2"
];
test
->
_attributes
->
out
=
tensors
[
"out"
];
test
->
_attributes
->
ans
=
tensors
[
"ans"
];
return
test
;
}
// Run the cdist operator on the given device, validate, and benchmark.
// Fix: the original leaked the workspace and the descriptor on the
// execution-failure and RESULT_INCORRECT paths (and leaked the descriptor when
// workspace-size query or allocation failed); every exit now releases both.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle,
    infiniDevice_t device,
    int device_id,
    size_t warm_ups,
    size_t iterations) {
    infiniopCdistDescriptor_t op_desc;
    // Move tensors to the target device.
    auto x1 = _attributes->x1->to(device, device_id);
    auto x2 = _attributes->x2->to(device, device_id);
    auto out = _attributes->out->to(device, device_id);
    CHECK_OR(infiniopCreateCdistDescriptor(handle, &op_desc, out->desc(),
                                           x1->desc(), x2->desc(), _attributes->p),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cdist descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetCdistWorkspaceSize(op_desc, &workspace_size),
             infiniopDestroyCdistDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             infiniopDestroyCdistDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    // Release device resources on every exit path below.
    auto cleanup = [&]() {
        infinirtFree(workspace);
        infiniopDestroyCdistDescriptor(op_desc);
    };
    CHECK_OR(infiniopCdist(op_desc, workspace, workspace_size,
                           out->data(), x1->data(), x2->data(), nullptr /* stream */),
             cleanup();
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
    // Validate the computed distance matrix against the reference answer.
    try {
        allClose(out, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        cleanup();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    // Performance benchmark.
    double elapsed_time = benchmark(
        [=]() {
            infiniopCdist(op_desc, workspace, workspace_size,
                          out->data(), x1->data(), x2->data(), nullptr);
        },
        warm_ups, iterations);
    cleanup();
    return TEST_PASSED(elapsed_time);
}
// Names of scalar attributes consumed by this test.
std::vector<std::string> Test::attribute_names() {
    return {"p"};
}

// Names of tensors this test requires.
std::vector<std::string> Test::tensor_names() {
    return {"x1", "x2", "out", "ans"};
}

// Tensors written by the operator under test.
std::vector<std::string> Test::output_names() {
    return {"out"};
}

// Human-readable summary of the test configuration.
std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << "\n";
    oss << "- p: " << _attributes->p << "\n";
    oss << "- x1: " << _attributes->x1->info() << "\n";
    oss << "- x2: " << _attributes->x2->info() << "\n";
    oss << "- out: " << _attributes->out->info() << "\n";
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << "\n";
    return oss.str();
}

Test::~Test() {
    delete _attributes;
}
}
// namespace infiniop_test::cdist
src/infiniop-test/src/ops/reciprocal.cpp
0 → 100644
View file @
21c6af2d
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace
infiniop_test
::
reciprocal
{
// Per-test tensor handles for the reciprocal operator test.
struct Test::Attributes {
    std::shared_ptr<Tensor> x;   // input
    std::shared_ptr<Tensor> y;   // output written by the operator
    std::shared_ptr<Tensor> ans; // reference answer
};
std
::
shared_ptr
<
Test
>
Test
::
build
(
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
uint8_t
>>
attributes
,
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Tensor
>>
tensors
,
double
rtol
,
double
atol
)
{
auto
test
=
std
::
shared_ptr
<
Test
>
(
new
Test
(
rtol
,
atol
));
test
->
_attributes
=
new
Attributes
();
if
(
tensors
.
find
(
"x"
)
==
tensors
.
end
()
||
tensors
.
find
(
"y"
)
==
tensors
.
end
()
||
tensors
.
find
(
"ans"
)
==
tensors
.
end
())
{
throw
std
::
runtime_error
(
"Invalid Test"
);
}
test
->
_attributes
->
x
=
tensors
[
"x"
];
test
->
_attributes
->
y
=
tensors
[
"y"
];
test
->
_attributes
->
ans
=
tensors
[
"ans"
];
return
test
;
}
// Run the reciprocal operator on the given device, validate, and benchmark.
// Fix: the original never called infinirtFree on the workspace nor
// infiniopDestroyReciprocalDescriptor on the descriptor — not even on the
// success path — leaking both on every run; all exits now release them.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle,
    infiniDevice_t device,
    int device_id,
    size_t warm_ups,
    size_t iterations) {
    infiniopReciprocalDescriptor_t op_desc;
    // Move tensors to the target device.
    auto x = _attributes->x->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    CHECK_OR(infiniopCreateReciprocalDescriptor(handle, &op_desc, y->desc(), x->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetReciprocalWorkspaceSize(op_desc, &workspace_size),
             infiniopDestroyReciprocalDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    void *workspace = nullptr;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             infiniopDestroyReciprocalDescriptor(op_desc);
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    // Release device resources on every exit path below.
    auto cleanup = [&]() {
        infinirtFree(workspace);
        infiniopDestroyReciprocalDescriptor(op_desc);
    };
    CHECK_OR(infiniopReciprocal(op_desc, workspace, workspace_size,
                                y->data(), x->data(), nullptr),
             cleanup();
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
    // Validate the computed output against the reference answer.
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        cleanup();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    // Performance benchmark.
    double elapsed_time = benchmark(
        [=]() {
            infiniopReciprocal(op_desc, workspace, workspace_size,
                               y->data(), x->data(), nullptr);
        },
        warm_ups, iterations);
    cleanup();
    return TEST_PASSED(elapsed_time);
}
// This test has no scalar attributes.
std::vector<std::string> Test::attribute_names() {
    return {};
}

// Names of tensors this test requires.
std::vector<std::string> Test::tensor_names() {
    return {"x", "y", "ans"};
}

// Tensors written by the operator under test.
std::vector<std::string> Test::output_names() {
    return {"y"};
}

// Human-readable summary of the test configuration.
std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << "\n";
    oss << "- x: " << _attributes->x->info() << "\n";
    oss << "- y: " << _attributes->y->info() << "\n";
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << "\n";
    return oss.str();
}

Test::~Test() {
    delete _attributes;
}
}
// namespace infiniop_test::reciprocal
src/infiniop/ops/addcmul/cpu/addcmul_cpu.cc
0 → 100644
View file @
21c6af2d
#include "addcmul_cpu.h"
namespace
op
::
addcmul
::
cpu
{
// Defaulted: the unique_ptr member releases the device impl automatically.
Descriptor::~Descriptor() = default;
// Build a CPU addcmul descriptor: validates dtype and shapes, then creates the
// generic elementwise descriptor and stores the scalar `value` on it.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float value) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    // Only floating-point element types are supported.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);

    // Every input must match the output shape (no broadcasting).
    const auto &y_shape = out_desc->shape();
    for (const auto &in_desc : input_desc_vec) {
        CHECK_SAME_SHAPE(y_shape, in_desc->shape());
    }

    // The macro instantiates the Descriptor and assigns it to *desc_ptr.
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    // Stash the scalar coefficient on the freshly created descriptor.
    (*desc_ptr)->_value = value;
    return INFINI_STATUS_SUCCESS;
}
// Dispatch the elementwise addcmul computation by element type.
// Fix: removed the unreachable `return INFINI_STATUS_SUCCESS;` that followed
// the switch — every case (including default) already returns.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // The template parameter AddcmulOp is defined in addcmul_cpu.h; the extra
    // trailing argument forwards the scalar `value` to the op.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<AddcmulOp, fp16_t>(_info, output, inputs, stream, _value);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<AddcmulOp, float>(_info, output, inputs, stream, _value);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<AddcmulOp, double>(_info, output, inputs, stream, _value);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<AddcmulOp, bf16_t>(_info, output, inputs, stream, _value);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
}
// namespace op::addcmul::cpu
src/infiniop/ops/addcmul/cpu/addcmul_cpu.h
0 → 100644
View file @
21c6af2d
#ifndef __ADDCMUL_CPU_H__
#define __ADDCMUL_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <cmath>
#include <type_traits>
namespace
op
::
addcmul
::
cpu
{
// Functor implementing the ternary addcmul: out = input + value * t1 * t2.
struct AddcmulOp {
public:
    // Three tensor inputs: input, t1, t2.
    static constexpr size_t num_inputs = 3;

    template <typename T, typename Scalar>
    T operator()(const T &input, const T &t1, const T &t2, Scalar value) const {
        if constexpr (std::is_floating_point_v<T>) {
            // Native floating-point types compute directly in T.
            return input + static_cast<T>(value) * t1 * t2;
        } else {
            // fp16/bf16-style types: promote to float for accuracy and to
            // handle the scalar multiplication, then cast back.
            const float a = static_cast<float>(input);
            const float b = static_cast<float>(t1);
            const float c = static_cast<float>(t2);
            const float v = static_cast<float>(value);
            return static_cast<T>(a + v * b * c);
        }
    }
};
// CPU-side descriptor for addcmul, customized to carry the extra scalar
// parameter `value` alongside the generic elementwise machinery.
class Descriptor final : public InfiniopDescriptor {
    infiniDtype_t _dtype;
    op::elementwise::ElementwiseInfo _info;
    std::unique_ptr<op::elementwise::cpu::DeviceImpl> _device_info;
    size_t _workspace_size;
    float _value; // scalar coefficient; set by create(), read by calculate()

    // Private: instances are produced by create() (via the elementwise macro).
    Descriptor(
        infiniDtype_t dtype,
        op::elementwise::ElementwiseInfo info,
        op::elementwise::cpu::DeviceImpl *device_info,
        size_t workspace_size,
        infiniDevice_t device_type,
        int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _dtype(dtype),
          _info(std::move(info)),
          _device_info(device_info),
          _workspace_size(workspace_size),
          _value(0.0f) {}

public:
    ~Descriptor();

    size_t workspaceSize() const { return _workspace_size; }

    // Additionally receives the scalar `value`.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t output_desc,
        std::vector<infiniopTensorDescriptor_t> input_descs,
        float value);

    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;

    float getValue() const { return _value; }
};
}
// namespace op::addcmul::cpu
#endif // __ADDCMUL_CPU_H__
src/infiniop/ops/addcmul/cuda/kernel.cuh
0 → 100644
View file @
21c6af2d
#ifndef __ADDCMUL_CUDA_CUH__
#define __ADDCMUL_CUDA_CUH__
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <type_traits>
namespace
op
::
addcmul
::
cuda
{
// CUDA functor implementing the ternary addcmul: out = input + value * t1 * t2.
struct AddcmulOp {
public:
    // Three tensor inputs: input, t1, t2.
    static constexpr size_t num_inputs = 3;

    template <typename T>
    __device__ __host__ __forceinline__ T operator()(const T &input, const T &t1, const T &t2, float value) const {
        const float v = value;
        if constexpr (std::is_same_v<T, half>) {
            // Promote to float for accuracy and to simplify the scalar multiply.
            return __float2half(__half2float(input) + v * __half2float(t1) * __half2float(t2));
        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
            return __float2bfloat16(__bfloat162float(input) + v * __bfloat162float(t1) * __bfloat162float(t2));
        } else if constexpr (std::is_same_v<T, float>) {
            return input + v * t1 * t2;
        } else if constexpr (std::is_same_v<T, double>) {
            return input + static_cast<double>(v) * t1 * t2;
        } else {
            // Fallback: compute in float and cast back.
            return static_cast<T>(static_cast<float>(input) + v * static_cast<float>(t1) * static_cast<float>(t2));
        }
    }
};
}
// namespace op::addcmul::cuda
#endif // __ADDCMUL_CUDA_CUH__
src/infiniop/ops/addcmul/metax/addcmul_metax.h
0 → 100644
View file @
21c6af2d
#ifndef __ADDCMUL_METAX_H__
#define __ADDCMUL_METAX_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
namespace
op
::
addcmul
::
metax
{
// METAX-side descriptor for addcmul, customized to carry the extra scalar
// parameter `value`.
class Descriptor final : public InfiniopDescriptor {
    // Kept for compatibility with the generic elementwise framework.
    infiniDtype_t _dtype;
    op::elementwise::ElementwiseInfo _info;
    std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
    size_t _workspace_size;
    float _value; // scalar coefficient `value`

public:
    // Tensor metadata recorded for the hand-written strided kernel.
    static constexpr int MAX_NDIM = 8;
    struct TensorMeta {
        int ndim;
        size_t shape[MAX_NDIM];
        ptrdiff_t strides[MAX_NDIM];
    };
    TensorMeta _out_meta{};
    TensorMeta _input_meta{};
    TensorMeta _t1_meta{};
    TensorMeta _t2_meta{};
    size_t _output_size{0};

    Descriptor(
        infiniDtype_t dtype,
        op::elementwise::ElementwiseInfo info,
        op::elementwise::metax::DeviceImpl *device_info,
        size_t workspace_size,
        infiniDevice_t device_type,
        int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _dtype(dtype),
          _info(std::move(info)),
          _device_info(device_info),
          _workspace_size(workspace_size),
          _value(0.0f) {}

public:
    ~Descriptor();

    size_t workspaceSize() const { return _workspace_size; }

    // Additionally receives the scalar `value`.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t output_desc,
        std::vector<infiniopTensorDescriptor_t> input_descs,
        float value);

    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;

    float getValue() const { return _value; }
};
}
// namespace op::addcmul::metax
#endif // __ADDCMUL_METAX_H__
src/infiniop/ops/addcmul/metax/addcmul_metax.maca
0 → 100644
View file @
21c6af2d
#include "../../../elementwise/metax/elementwise_metax.h"
#include "addcmul_metax.h"
#include "addcmul_metax_kernel.h"
namespace op::addcmul::metax {
// Defaulted: the unique_ptr member releases the device impl automatically.
Descriptor::~Descriptor() = default;
// Copy shape/strides from a TensorDescriptor into the fixed-size TensorMeta
// that the hand-written kernel consumes.
static inline infiniStatus_t fill_tensor_meta(
    infiniopTensorDescriptor_t desc,
    Descriptor::TensorMeta &meta) {
    const auto rank = desc->ndim();
    if (rank > Descriptor::MAX_NDIM) {
        // Tensor has more dimensions than the fixed-size meta can hold.
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
    meta.ndim = static_cast<int>(rank);
    const auto &dims = desc->shape();
    const auto &steps = desc->strides();
    for (int axis = 0; axis < meta.ndim; ++axis) {
        meta.shape[axis] = dims[axis];
        meta.strides[axis] = steps[axis];
    }
    return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float value) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // 1. Element-type check.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    // 2. Shape check: output and all three inputs must agree (broadcasting unsupported).
    const auto &out_shape = out_desc->shape();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &t1_desc = input_desc_vec.at(1);
    const auto &t2_desc = input_desc_vec.at(2);
    CHECK_SAME_SHAPE(out_shape, input_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t1_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t2_desc->shape());
    // 3. Create the underlying elementwise METAX descriptor (assigns *desc_ptr).
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    // 4. Record tensor metadata and output element count for the custom kernel.
    auto *desc = *desc_ptr;
    desc->_output_size = out_desc->numel();
    CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta));
    CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta));
    CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta));
    CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta));
    // 5. Store the scalar attribute `value` on the descriptor.
    desc->_value = value;
    return INFINI_STATUS_SUCCESS;
}
// Custom addcmul METAX kernel: generic strided addressing driven by the
// TensorMeta records captured at descriptor-creation time.
template <typename T>
INFINIOP_METAX_KERNEL addcmul_kernel(
    size_t output_size,
    Descriptor::TensorMeta out_meta,
    Descriptor::TensorMeta in_meta,
    Descriptor::TensorMeta t1_meta,
    Descriptor::TensorMeta t2_meta,
    T *out,
    const T *input,
    const T *t1,
    const T *t2,
    float value) {
    const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= output_size) {
        return;
    }
    // Decompose the linear index along the output shape and accumulate each
    // tensor's strided offset in lockstep.
    ptrdiff_t dst_off = 0, src_off = 0, a_off = 0, b_off = 0;
    size_t rest = tid;
    for (int d = out_meta.ndim - 1; d >= 0; --d) {
        const size_t extent = out_meta.shape[d];
        const ptrdiff_t coord = static_cast<ptrdiff_t>(rest % extent);
        rest /= extent;
        dst_off += coord * out_meta.strides[d];
        src_off += coord * in_meta.strides[d];
        a_off += coord * t1_meta.strides[d];
        b_off += coord * t2_meta.strides[d];
    }
    out[dst_off] = op::addcmul::metax::AddcmulOp{}(input[src_off], t1[a_off], t2[b_off], value);
}
// Typed launch wrapper: unpacks raw pointers, picks the grid size, and fires
// the kernel on the caller-provided stream.
template <typename T>
static inline infiniStatus_t launch_addcmul_kernel(
    const Descriptor *desc,
    void *output,
    const std::vector<const void *> &inputs,
    void *stream) {
    const size_t total = desc->_output_size;
    if (total == 0) {
        return INFINI_STATUS_SUCCESS; // nothing to compute
    }
    auto *dst = reinterpret_cast<T *>(output);
    auto *src_in = reinterpret_cast<const T *>(inputs.at(0));
    auto *src_t1 = reinterpret_cast<const T *>(inputs.at(1));
    auto *src_t2 = reinterpret_cast<const T *>(inputs.at(2));
    auto metax_stream = reinterpret_cast<mcStream_t>(stream);
    constexpr uint32_t BLOCK_SIZE = 256;
    const uint32_t grid = static_cast<uint32_t>((total + BLOCK_SIZE - 1) / BLOCK_SIZE);
    addcmul_kernel<T><<<grid, BLOCK_SIZE, 0, metax_stream>>>(
        total,
        desc->_out_meta,
        desc->_input_meta,
        desc->_t1_meta,
        desc->_t2_meta,
        dst,
        src_in,
        src_t1,
        src_t2,
        desc->getValue());
    CHECK_METAX(mcGetLastError());
    return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // Workspace contents are unused; only check the size so the interface
    // semantics stay consistent with the other operators.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Dispatch straight to the custom METAX kernel, bypassing the generic
    // elementwise framework.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return launch_addcmul_kernel<half>(this, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return launch_addcmul_kernel<cuda_bfloat16>(this, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return launch_addcmul_kernel<float>(this, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return launch_addcmul_kernel<double>(this, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::metax
src/infiniop/ops/addcmul/metax/addcmul_metax_kernel.h
0 → 100644
View file @
21c6af2d
#ifndef __ADDCMUL_METAX_KERNEL_H__
#define __ADDCMUL_METAX_KERNEL_H__
/*
 * This file contains the Addcmul operation implementation for the Metax (MACA) backend.
 * Formula: out = input + value * tensor1 * tensor2
 */
namespace
op
::
addcmul
::
metax
{
// Functor implementing the ternary addcmul: out = in + value * t1 * t2.
typedef struct AddcmulOp {
public:
    // Three tensor inputs: in, t1, t2.
    static constexpr size_t num_inputs = 3;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const {
        if constexpr (std::is_same_v<T, float>) {
            // F32: multiply-add directly.
            return in + value * t1 * t2;
        } else if constexpr (std::is_same_v<T, half>) {
            // F16: promote to float to avoid overflow in the intermediate product.
            return __float2half(__half2float(in) + value * __half2float(t1) * __half2float(t2));
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // BF16: likewise promoted to float.
            return __float2bfloat16_rn(__bfloat162float(in) + value * __bfloat162float(t1) * __bfloat162float(t2));
        } else if constexpr (std::is_same_v<T, double>) {
            return in + (double)value * t1 * t2;
        } else {
            // Integral and other element types.
            return in + static_cast<T>(value) * t1 * t2;
        }
    }
} AddcmulOp;
}
// namespace op::addcmul::metax
#endif // __ADDCMUL_METAX_KERNEL_H__
src/infiniop/ops/addcmul/moore/addcmul_moore.h
0 → 100644
View file @
21c6af2d
#ifndef __ADDCMUL_MOORE_H__
#define __ADDCMUL_MOORE_H__
// 1. 切换到 Moore 平台的 Elementwise API
#include "../../../elementwise/moore/elementwise_moore_api.h"
namespace
op
::
addcmul
::
moore
{
/**
 * Moore-side descriptor for addcmul, carrying the extra scalar `value`.
 * Mirrors the NVIDIA version's structure for cross-platform parity.
 */
class Descriptor final : public InfiniopDescriptor {
    infiniDtype_t _dtype;
    op::elementwise::ElementwiseInfo _info;
    // Device-impl pointer for the Moore (MUSA) backend.
    std::unique_ptr<op::elementwise::moore::DeviceImpl> _device_info;
    size_t _workspace_size;
    float _value; // scalar coefficient `value`

public:
    // Tensor metadata recorded for strided access in the MUSA kernel.
    static constexpr int MAX_NDIM = 8;
    struct TensorMeta {
        int ndim;
        size_t shape[MAX_NDIM];
        ptrdiff_t strides[MAX_NDIM];
    };
    TensorMeta _out_meta{};
    TensorMeta _input_meta{};
    TensorMeta _t1_meta{};
    TensorMeta _t2_meta{};
    size_t _output_size{0};

    Descriptor(
        infiniDtype_t dtype,
        op::elementwise::ElementwiseInfo info,
        op::elementwise::moore::DeviceImpl *device_info,
        size_t workspace_size,
        infiniDevice_t device_type,
        int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _dtype(dtype),
          _info(std::move(info)),
          _device_info(device_info),
          _workspace_size(workspace_size),
          _value(0.0f) {}

public:
    ~Descriptor();

    size_t workspaceSize() const { return _workspace_size; }

    // Same interface as the other backends: additionally receives `value`.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t output_desc,
        std::vector<infiniopTensorDescriptor_t> input_descs,
        float value);

    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;

    float getValue() const { return _value; }
};
}
// namespace op::addcmul::moore
#endif // __ADDCMUL_MOORE_H__
src/infiniop/ops/addcmul/moore/addcmul_moore.mu
0 → 100644
View file @
21c6af2d
#include "../../../elementwise/moore/elementwise_moore.h"
#include "addcmul_moore.h"
#include "addcmul_moore_kernel.h"
#include <musa_runtime.h>
namespace op::addcmul::moore {
// Defaulted: the unique_ptr member releases the device impl automatically.
Descriptor::~Descriptor() = default;
// Copy shape/strides from a TensorDescriptor into the fixed-size TensorMeta
// used for strided addressing inside the MUSA kernel (same logic as NVIDIA).
static inline infiniStatus_t fill_tensor_meta(
    infiniopTensorDescriptor_t desc,
    Descriptor::TensorMeta &meta) {
    const auto rank = desc->ndim();
    if (rank > Descriptor::MAX_NDIM) {
        // Tensor has more dimensions than the fixed-size meta can hold.
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
    meta.ndim = static_cast<int>(rank);
    const auto &dims = desc->shape();
    const auto &steps = desc->strides();
    for (int axis = 0; axis < meta.ndim; ++axis) {
        meta.shape[axis] = dims[axis];
        meta.strides[axis] = steps[axis];
    }
    return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float value) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // Element-type check.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    // Shape check (input, t1, t2 must match the output).
    const auto &out_shape = out_desc->shape();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &t1_desc = input_desc_vec.at(1);
    const auto &t2_desc = input_desc_vec.at(2);
    CHECK_SAME_SHAPE(out_shape, input_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t1_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t2_desc->shape());
    // 2. Invoke the Moore-platform descriptor-creation macro (assigns *desc_ptr).
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    auto *desc = *desc_ptr;
    desc->_output_size = out_desc->numel();
    // Record per-tensor metadata for the custom kernel.
    CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta));
    CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta));
    CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta));
    CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta));
    desc->_value = value;
    return INFINI_STATUS_SUCCESS;
}
// MUSA kernel: one thread per output element. The linear thread index is
// decomposed into multi-dimensional coordinates via the output shape, and each
// tensor's own strides are applied, so non-contiguous layouts are supported.
template <typename T>
__global__ void addcmul_kernel(
    size_t output_size,
    Descriptor::TensorMeta out_meta,
    Descriptor::TensorMeta in_meta,
    Descriptor::TensorMeta t1_meta,
    Descriptor::TensorMeta t2_meta,
    T *out,
    const T *input,
    const T *t1,
    const T *t2,
    float value) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= output_size) {
        return;
    }
    ptrdiff_t out_offset = 0, in_offset = 0, t1_offset = 0, t2_offset = 0;
    size_t linear = idx;
    // Generic linear-index -> per-tensor-offset conversion (innermost dim first).
    for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
        size_t dim_size = out_meta.shape[dim];
        size_t coord = linear % dim_size;
        linear /= dim_size;
        out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
        in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
        t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
        t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
    }
    // out = input + value * t1 * t2, via the Moore-platform AddcmulOp functor.
    out[out_offset] = op::addcmul::moore::AddcmulOp{}(input[in_offset], t1[t1_offset], t2[t2_offset], value);
}
// Kernel-launch wrapper: casts the type-erased pointers to T, computes the
// launch grid and enqueues addcmul_kernel<T> on the caller's MUSA stream.
template <typename T>
static inline infiniStatus_t launch_addcmul_kernel(
    const Descriptor *desc,
    void *output,
    const std::vector<const void *> &inputs,
    void *stream) {
    size_t output_size = desc->_output_size;
    if (output_size == 0) {
        // Empty tensor: nothing to launch.
        return INFINI_STATUS_SUCCESS;
    }
    auto *out_ptr = reinterpret_cast<T *>(output);
    auto *in_ptr = reinterpret_cast<const T *>(inputs.at(0));
    auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
    auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));
    musaStream_t musa_stream = reinterpret_cast<musaStream_t>(stream);
    constexpr uint32_t BLOCK_SIZE = 256;
    // Ceiling division: enough blocks to cover every output element.
    uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
    addcmul_kernel<T><<<grid, BLOCK_SIZE, 0, musa_stream>>>(
        output_size, desc->_out_meta, desc->_input_meta, desc->_t1_meta, desc->_t2_meta,
        out_ptr, in_ptr, t1_ptr, t2_ptr, desc->getValue());
    // NOTE(review): unlike the NVIDIA backend (which calls CHECK_CUDA(
    // cudaGetLastError())), no musaGetLastError() check follows this launch —
    // consider adding one for parity so launch failures are surfaced.
    return INFINI_STATUS_SUCCESS;
}
// Entry point called by the C API: checks the workspace contract, then
// dispatches to the typed kernel launcher based on the dtype recorded at
// descriptor-creation time.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // The kernel needs no scratch memory; the size check only keeps the
    // interface semantics consistent with other operators.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return launch_addcmul_kernel<half>(this, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        // bf16 type name as provided by the Moore toolchain.
        return launch_addcmul_kernel<cuda_bfloat16>(this, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return launch_addcmul_kernel<float>(this, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return launch_addcmul_kernel<double>(this, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::addcmul::moore
src/infiniop/ops/addcmul/moore/addcmul_moore_kernel.h
0 → 100644
View file @
21c6af2d
#ifndef __ADDCMUL_MOORE_KERNEL_H__
#define __ADDCMUL_MOORE_KERNEL_H__
/*
* This file contains the Addcmul operation implementation for the MUSA backend.
* Formula: out = input + value * tensor1 * tensor2
*/
namespace op::addcmul::moore {

// Element-wise functor computing: out = input + value * tensor1 * tensor2.
typedef struct AddcmulOp {
public:
    // Ternary operator — inputs are (input, tensor1, tensor2).
    static constexpr size_t num_inputs = 3;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const {
        if constexpr (std::is_same_v<T, float>) {
            // F32: compute directly in float.
            return in + value * t1 * t2;
        } else if constexpr (std::is_same_v<T, half>) {
            // F16: widen to float so the intermediate product keeps precision
            // and cannot overflow half range.
            float f_in = __half2float(in);
            float f_t1 = __half2float(t1);
            float f_t2 = __half2float(t2);
            return __float2half(f_in + value * f_t1 * f_t2);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // BF16: same widen-to-float scheme, round-to-nearest on narrowing.
            float f_in = __bfloat162float(in);
            float f_t1 = __bfloat162float(t1);
            float f_t2 = __bfloat162float(t2);
            return __float2bfloat16_rn(f_in + value * f_t1 * f_t2);
        } else if constexpr (std::is_same_v<T, double>) {
            // F64: promote the float scalar to double before multiplying.
            return in + (double)value * t1 * t2;
        } else {
            // Integer or other types: truncate the scalar into T.
            return in + static_cast<T>(value) * t1 * t2;
        }
    }
} AddcmulOp;

} // namespace op::addcmul::moore
#endif // __ADDCMUL_MOORE_KERNEL_H__
src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cu
0 → 100644
View file @
21c6af2d
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "addcmul_nvidia.cuh"
namespace
op
::
addcmul
::
nvidia
{
// Out-of-line defaulted destructor; members release themselves via RAII.
Descriptor::~Descriptor() = default;
// Copy shape/strides from a TensorDescriptor into the fixed-size TensorMeta
// consumed by the strided CUDA kernel.
// Returns NOT_IMPLEMENTED when the tensor rank exceeds MAX_NDIM.
static inline infiniStatus_t fill_tensor_meta(
    infiniopTensorDescriptor_t desc,
    Descriptor::TensorMeta &meta) {
    auto ndim = desc->ndim();
    if (ndim > Descriptor::MAX_NDIM) {
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
    meta.ndim = static_cast<int>(ndim);
    const auto &shape = desc->shape();
    const auto &strides = desc->strides();
    for (int i = 0; i < meta.ndim; ++i) {
        meta.shape[i] = shape[i];
        meta.strides[i] = strides[i];
    }
    return INFINI_STATUS_SUCCESS;
}
// Validate dtype/shapes, build the underlying elementwise CUDA descriptor, and
// cache per-tensor strided metadata plus the scalar `value` on the Descriptor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float value) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // 1. Dtype check: floating-point types only.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    // 2. Shape check: output and all three inputs must match (no broadcasting).
    const auto &out_shape = out_desc->shape();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &t1_desc = input_desc_vec.at(1);
    const auto &t2_desc = input_desc_vec.at(2);
    CHECK_SAME_SHAPE(out_shape, input_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t1_desc->shape());
    CHECK_SAME_SHAPE(out_shape, t2_desc->shape());
    // 3. Create the underlying elementwise CUDA descriptor (fills *desc_ptr).
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    // 4. Record tensor metadata and element count for the custom CUDA kernel.
    auto *desc = *desc_ptr;
    desc->_output_size = out_desc->numel();
    CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta));
    CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta));
    CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta));
    CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta));
    // 5. Store the scalar attribute `value` on the descriptor.
    desc->_value = value;
    return INFINI_STATUS_SUCCESS;
}
// Custom addcmul CUDA kernel: one thread per output element, using the
// Descriptor's TensorMeta shape/strides for generic strided addressing.
template <typename T>
INFINIOP_CUDA_KERNEL addcmul_kernel(
    size_t output_size,
    Descriptor::TensorMeta out_meta,
    Descriptor::TensorMeta in_meta,
    Descriptor::TensorMeta t1_meta,
    Descriptor::TensorMeta t2_meta,
    T *out,
    const T *input,
    const T *t1,
    const T *t2,
    float value) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= output_size) {
        return;
    }
    // Decompose the linear index with the output shape, then apply each
    // tensor's own strides to obtain its element offset.
    ptrdiff_t out_offset = 0;
    ptrdiff_t in_offset = 0;
    ptrdiff_t t1_offset = 0;
    ptrdiff_t t2_offset = 0;
    size_t linear = idx;
    for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
        size_t dim_size = out_meta.shape[dim];
        size_t coord = linear % dim_size;
        linear /= dim_size;
        out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
        in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
        t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
        t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
    }
    T in_val = input[in_offset];
    T t1_val = t1[t1_offset];
    T t2_val = t2[t2_offset];
    // out = input + value * t1 * t2 via the shared CUDA functor.
    out[out_offset] = op::addcmul::cuda::AddcmulOp{}(in_val, t1_val, t2_val, value);
}
// Launch wrapper: casts the type-erased pointers to T, computes the grid,
// enqueues addcmul_kernel<T> on the caller's CUDA stream, and surfaces any
// launch error via cudaGetLastError().
template <typename T>
static inline infiniStatus_t launch_addcmul_kernel(
    const Descriptor *desc,
    void *output,
    const std::vector<const void *> &inputs,
    void *stream) {
    size_t output_size = desc->_output_size;
    if (output_size == 0) {
        // Empty tensor: nothing to launch.
        return INFINI_STATUS_SUCCESS;
    }
    auto *out_ptr = reinterpret_cast<T *>(output);
    auto *in_ptr = reinterpret_cast<const T *>(inputs.at(0));
    auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
    auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));
    cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    constexpr uint32_t BLOCK_SIZE = 256;
    // Ceiling division: enough blocks to cover every output element.
    uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
    addcmul_kernel<T><<<grid, BLOCK_SIZE, 0, cuda_stream>>>(
        output_size, desc->_out_meta, desc->_input_meta, desc->_t1_meta, desc->_t2_meta,
        out_ptr, in_ptr, t1_ptr, t2_ptr, desc->getValue());
    // Report launch-time failures (invalid configuration, bad stream, ...).
    CHECK_CUDA(cudaGetLastError());
    return INFINI_STATUS_SUCCESS;
}
// Entry point called by the C API: checks the workspace contract, then
// dispatches on dtype to the typed kernel launcher.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    // The workspace contents are unused; the size check only keeps the
    // interface semantics consistent with other operators.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Call the custom CUDA kernel directly, bypassing the generic elementwise
    // framework.
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return launch_addcmul_kernel<half>(this, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return launch_addcmul_kernel<nv_bfloat16>(this, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return launch_addcmul_kernel<float>(this, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return launch_addcmul_kernel<double>(this, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
}
// namespace op::addcmul::nvidia
src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cuh
0 → 100644
View file @
21c6af2d
#ifndef __ADDCMUL_NVIDIA_H__
#define __ADDCMUL_NVIDIA_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
namespace op::addcmul::nvidia {

// Custom NVIDIA-side Descriptor for addcmul. Unlike the generic elementwise
// descriptor it additionally carries the scalar coefficient `value` and
// per-tensor strided metadata consumed by the hand-written CUDA kernel.
class Descriptor final : public InfiniopDescriptor {
    // Members kept for compatibility with the generic elementwise framework.
    infiniDtype_t _dtype;
    op::elementwise::ElementwiseInfo _info;
    std::unique_ptr<op::elementwise::nvidia::DeviceImpl> _device_info;
    size_t _workspace_size;
    float _value; // scalar coefficient `value`

public:
    // Tensor metadata recorded for the custom CUDA kernel.
    static constexpr int MAX_NDIM = 8;
    struct TensorMeta {
        int ndim;
        size_t shape[MAX_NDIM];
        ptrdiff_t strides[MAX_NDIM];
    };
    TensorMeta _out_meta{};
    TensorMeta _input_meta{};
    TensorMeta _t1_meta{};
    TensorMeta _t2_meta{};
    size_t _output_size{0};

    // `_value` starts at 0.0f and is filled in later by create().
    Descriptor(
        infiniDtype_t dtype,
        op::elementwise::ElementwiseInfo info,
        op::elementwise::nvidia::DeviceImpl *device_info,
        size_t workspace_size,
        infiniDevice_t device_type,
        int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _dtype(dtype),
          _info(std::move(info)),
          _device_info(device_info),
          _workspace_size(workspace_size),
          _value(0.0f) {}

public:
    ~Descriptor();

    size_t workspaceSize() const { return _workspace_size; }

    // Same interface as other elementwise ops plus the scalar `value` param.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t output_desc,
        std::vector<infiniopTensorDescriptor_t> input_descs,
        float value);

    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;

    float getValue() const { return _value; }
};

} // namespace op::addcmul::nvidia
#endif // __ADDCMUL_NVIDIA_H__
src/infiniop/ops/addcmul/operator.cc
0 → 100644
View file @
21c6af2d
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/addcmul.h"
#ifdef ENABLE_CPU_API
#include "cpu/addcmul_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/addcmul_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/addcmul_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/addcmul_kunlun.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/addcmul_bang.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/addcmul_moore.h"
#endif
// C API: create an addcmul descriptor for the device targeted by `handle`.
// The three inputs are packed as {input, t1, t2}; `value` is the scalar
// coefficient in: out = input + value * t1 * t2.
__INFINI_C infiniStatus_t infiniopCreateAddcmulDescriptor(
    infiniopHandle_t handle,
    infiniopAddcmulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t t1_desc,
    infiniopTensorDescriptor_t t2_desc,
    float value) {

// Per-backend dispatch: forwards to the backend namespace's Descriptor::create.
#define CREATE(CASE, NAMESPACE)                                                \
    case CASE:                                                                 \
        return op::addcmul::NAMESPACE::Descriptor::create(                     \
            handle,                                                            \
            reinterpret_cast<op::addcmul::NAMESPACE::Descriptor **>(desc_ptr), \
            out_desc,                                                          \
            {input_desc, t1_desc, t2_desc},                                    \
            value)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CREATE
}
// C API: query the workspace size required by infiniopAddcmul for `desc`.
__INFINI_C infiniStatus_t infiniopGetAddcmulWorkspaceSize(
    infiniopAddcmulDescriptor_t desc,
    size_t *size) {

// Per-backend dispatch: reads workspaceSize() off the backend descriptor.
#define GET(CASE, NAMESPACE)                                                                   \
    case CASE:                                                                                 \
        *size = reinterpret_cast<op::addcmul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef GET
}
// C API: execute addcmul (out = input + value * t1 * t2) with a previously
// created descriptor, on the backend matching the descriptor's device type.
__INFINI_C infiniStatus_t infiniopAddcmul(
    infiniopAddcmulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *input,
    const void *t1,
    const void *t2,
    void *stream) {

// Per-backend dispatch: forwards to the backend descriptor's calculate().
#define CALCULATE(CASE, NAMESPACE)                                                \
    case CASE:                                                                    \
        return reinterpret_cast<const op::addcmul::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, out, {input, t1, t2}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CALCULATE
}
// C API: destroy an addcmul descriptor, deleting the backend-specific object.
__INFINI_C infiniStatus_t infiniopDestroyAddcmulDescriptor(
    infiniopAddcmulDescriptor_t desc) {

// Per-backend dispatch: delete through the concrete descriptor type.
#define DELETE(CASE, NAMESPACE)                                                  \
    case CASE:                                                                   \
        delete reinterpret_cast<const op::addcmul::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef DELETE
}
src/infiniop/ops/atanh/cpu/atanh_cpu.cc
0 → 100644
View file @
21c6af2d
#include "atanh_cpu.h"
namespace
op
::
atanh
::
cpu
{
// Out-of-line defaulted destructor; members release themselves via RAII.
Descriptor::~Descriptor() = default;
// Validate dtype and shape, then build the underlying elementwise CPU
// descriptor for the unary atanh operator.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &a_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    // Floating-point dtypes only; input shape must match the output shape.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, a_shape);
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Dispatch to the dtype-specific elementwise computation, using the AtanhOp
// functor defined in atanh_cpu.h as the per-element math.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<AtanhOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<AtanhOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<AtanhOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<AtanhOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch branch returns, so no trailing return is needed (the
    // original had an unreachable `return INFINI_STATUS_SUCCESS;` here).
}
}
// namespace op::atanh::cpu
src/infiniop/ops/atanh/cpu/atanh_cpu.h
0 → 100644
View file @
21c6af2d
#ifndef __ATANH_CPU_H__
#define __ATANH_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <cmath>
#include <type_traits>
// 注册 atanh 算子在 cpu 后端的 descriptor
// Declare the CPU-backend elementwise Descriptor class for the atanh operator.
ELEMENTWISE_DESCRIPTOR(atanh, cpu)
namespace
op
::
atanh
::
cpu
{
// Unary atanh functor consumed by the CPU elementwise framework.
typedef struct AtanhOp {
public:
    // atanh takes exactly one input tensor.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    T operator()(const T &a) const {
        if constexpr (std::is_floating_point_v<T>) {
            // Native float/double/long double: call std::atanh directly.
            return std::atanh(a);
        } else {
            // half / bfloat16 style custom types: widen to float, compute,
            // then narrow back (assumes T converts to/from float).
            const float widened = static_cast<float>(a);
            return static_cast<T>(std::atanhf(widened));
        }
    }
} AtanhOp;
}
// namespace op::atanh::cpu
#endif // __ATANH_CPU_H__
src/infiniop/ops/atanh/cuda/kernel.cuh
0 → 100644
View file @
21c6af2d
#ifndef __ATANH_CUDA_H__
#define __ATANH_CUDA_H__
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace
op
::
atanh
::
cuda
{
// Unary atanh functor consumed by the CUDA elementwise framework.
typedef struct AtanhOp {
public:
    // atanh takes exactly one input.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &a) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Apply atanh to both lanes of the packed half2.
            float2 f = __half22float2(a);
            f.x = atanhf(f.x);
            f.y = atanhf(f.y);
            return __float22half2_rn(f);
        } else if constexpr (std::is_same_v<T, half>) {
            // half: widen to float, compute, narrow back.
            return __float2half(atanhf(__half2float(a)));
        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
            // bfloat16: same widen/narrow round-trip.
            return __float2bfloat16(atanhf(__bfloat162float(a)));
        } else if constexpr (std::is_same_v<T, float>) {
            // float: call the math-library function directly.
            return atanhf(a);
        } else if constexpr (std::is_same_v<T, double>) {
            return ::atanh(a);
        } else {
            // Fallback for other types; not expected to be reached.
            return static_cast<T>(atanhf(static_cast<float>(a)));
        }
    }
} AtanhOp;
}
// namespace op::atanh::cuda
#endif // __ATANH_CUDA_H__
src/infiniop/ops/atanh/metax/atanh_metax.h
0 → 100644
View file @
21c6af2d
#ifndef __ATANH_METAX_API_H__
#define __ATANH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
// Declare the METAX-backend elementwise Descriptor class for the atanh operator.
ELEMENTWISE_DESCRIPTOR(atanh, metax)
#endif // __ATANH_METAX_API_H__
src/infiniop/ops/atanh/metax/atanh_metax.maca
0 → 100644
View file @
21c6af2d
#include "../../../elementwise/metax/elementwise_metax.h"
#include "atanh_metax.h"
#include "atanh_metax_kernel.h"
namespace op::atanh::metax {
// Out-of-line defaulted destructor; members release themselves via RAII.
Descriptor::~Descriptor() = default;
// Validate dtype and shape, then build the underlying elementwise METAX
// descriptor for the unary atanh operator.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &a_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    // Floating-point dtypes only; input shape must match the output shape.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(y_shape, a_shape);
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Check the workspace contract, then dispatch to the framework's elementwise
// kernel (256-thread blocks) with the metax AtanhOp functor, per dtype.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, metax::AtanhOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, metax::AtanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, metax::AtanhOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, metax::AtanhOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch branch returns, so no trailing return is needed (the
    // original had an unreachable `return INFINI_STATUS_SUCCESS;` here).
}
} // namespace op::atanh::metax
Prev
1
2
3
4
5
6
Next
yanzy
@yanzy
mentioned in commit
18773b69
·
Apr 21, 2026
mentioned in commit
18773b69
mentioned in commit 18773b69ae7bd79b4e9cf9ac0a4e4c6ed1bf9bf8
Toggle commit list
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment