jerrrrry / infinicore
Commit 45a3794b, authored Mar 11, 2026 by wooway777
issue/1031 T1-1-17
parent cb7f0b7d
Changes: 108. Showing 20 changed files with 1335 additions and 0 deletions (+1335, -0).
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh   +8   -0
src/infiniop/ops/cross_entropy/operator.cc                        +174 -0
src/infiniop/ops/equal/cpu/equal_cpu.cc                           +68  -0
src/infiniop/ops/equal/cpu/equal_cpu.h                            +28  -0
src/infiniop/ops/equal/cuda/kernel.cuh                            +37  -0
src/infiniop/ops/equal/metax/equal_metax.h                        +8   -0
src/infiniop/ops/equal/metax/equal_metax.maca                     +69  -0
src/infiniop/ops/equal/moore/equal_moore.h                        +8   -0
src/infiniop/ops/equal/moore/equal_moore.mu                       +140 -0
src/infiniop/ops/equal/moore/equal_moore_kernel.h                 +30  -0
src/infiniop/ops/equal/nvidia/equal_nvidia.cu                     +137 -0
src/infiniop/ops/equal/nvidia/equal_nvidia.cuh                    +8   -0
src/infiniop/ops/equal/operator.cc                                +201 -0
src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc                   +91  -0
src/infiniop/ops/hardswish/cpu/hardswish_cpu.h                    +50  -0
src/infiniop/ops/hardswish/cuda/kernel.cuh                        +86  -0
src/infiniop/ops/hardswish/metax/hardswish_metax.h                +8   -0
src/infiniop/ops/hardswish/metax/hardswish_metax.maca             +58  -0
src/infiniop/ops/hardswish/moore/hardswish_moore.h                +8   -0
src/infiniop/ops/hardswish/moore/hardswish_moore.mu               +118 -0
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh
0 → 100644
#ifndef __CROSS_ENTROPY_NVIDIA_H__
#define __CROSS_ENTROPY_NVIDIA_H__
#include "../cross_entropy.h"
DESCRIPTOR(nvidia)
#endif
src/infiniop/ops/cross_entropy/operator.cc
0 → 100644
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/cross_entropy.h"
#ifdef ENABLE_CPU_API
#include "cpu/cross_entropy_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/cross_entropy_nvidia.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/cross_entropy_moore.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/cross_entropy_metax.h"
#endif

__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor(
    infiniopHandle_t handle,
    infiniopCrossEntropyDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
#define CREATE(CASE, NAMESPACE) \
    case CASE: \
        return op::cross_entropy::NAMESPACE::Descriptor::create( \
            handle, \
            reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc, x_desc, target_desc);

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CREATE
}

__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize(
    infiniopCrossEntropyDescriptor_t desc,
    size_t *size) {
#define GET(CASE, NAMESPACE) \
    case CASE: \
        *size = reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef GET
}

__INFINI_C infiniStatus_t infiniopCrossEntropy(
    infiniopCrossEntropyDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *target,
    void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
    case CASE: \
        return reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, x, target, stream);

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CALCULATE
}

__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor(
    infiniopCrossEntropyDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
    case CASE: \
        delete reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        DESTROY(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        DESTROY(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef DESTROY
}
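For orientation, the four entry points in this file follow the standard InfiniOp descriptor lifecycle: create a descriptor from tensor descriptors, query the workspace size, run the computation on a stream, then destroy the descriptor. Below is a minimal, hypothetical call sequence; the handle, tensor descriptors, device buffers, and workspace allocation are assumed to exist already (none of that setup is part of this commit).

// Hypothetical usage sketch, not part of this commit. Assumes the infiniop
// headers are included and that all descriptors and buffers are prepared elsewhere.
infiniStatus_t run_cross_entropy_once(infiniopHandle_t handle,
                                      infiniopTensorDescriptor_t y_desc,
                                      infiniopTensorDescriptor_t x_desc,
                                      infiniopTensorDescriptor_t target_desc,
                                      void *y, const void *x, const void *target,
                                      void *workspace, size_t workspace_capacity,
                                      void *stream) {
    infiniopCrossEntropyDescriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreateCrossEntropyDescriptor(
        handle, &desc, y_desc, x_desc, target_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }
    size_t workspace_size = 0;
    status = infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS && workspace_size <= workspace_capacity) {
        // Dispatches to whichever backend the handle's device selects.
        status = infiniopCrossEntropy(desc, workspace, workspace_size,
                                      y, x, target, stream);
    }
    infiniopDestroyCrossEntropyDescriptor(desc);
    return status;
}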
src/infiniop/ops/equal/cpu/equal_cpu.cc
0 → 100644
#include <cstdint>
#include <type_traits>
#include "equal_cpu.h"

namespace op::equal::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();
    if (compute_dtype != b_desc->dtype()) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<EqualOp, bool, fp16_t, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<EqualOp, bool, float, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<EqualOp, bool, double, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<EqualOp, bool, bf16_t, bf16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<EqualOp, bool, int32_t, int32_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<EqualOp, bool, int64_t, int64_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::equal::cpu
src/infiniop/ops/equal/cpu/equal_cpu.h
0 → 100644
#ifndef __EQUAL_CPU_H__
#define __EQUAL_CPU_H__

#include <type_traits>
#include "../../../elementwise/cpu/elementwise_cpu.h"

ELEMENTWISE_DESCRIPTOR(equal, cpu)

namespace op::equal::cpu {
typedef struct EqualOp {
public:
    static constexpr size_t num_inputs = 2;
    template <typename Tout, typename Tin0, typename Tin1>
    bool operator()(const Tin0 &a, const Tin1 &b) {
        if constexpr (std::is_same_v<Tin0, Tin1>) {
            return a == b;
        } else {
            return false;
        }
    }
} EqualOp;
} // namespace op::equal::cpu

#endif
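The EqualOp functor is stateless and purely type-driven: when the two input types match it compares the values, and when they differ the if constexpr branch collapses to a constant false. A standalone host-side sketch of that behaviour (hypothetical, assuming the header above is on the include path):

// Illustrative only; not part of this commit.
#include <cstdio>
#include "equal_cpu.h" // src/infiniop/ops/equal/cpu/equal_cpu.h

int main() {
    op::equal::cpu::EqualOp eq{};
    bool same  = eq.operator()<bool, float, float>(1.5f, 1.5f); // true
    bool diff  = eq.operator()<bool, float, float>(1.5f, 2.0f); // false
    bool mixed = eq.operator()<bool, float, int>(1.5f, 1);      // false: input types differ
    std::printf("%d %d %d\n", same, diff, mixed);
    return 0;
}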
src/infiniop/ops/equal/cuda/kernel.cuh
0 → 100644
#ifndef __EQUAL_CUDA_H__
#define __EQUAL_CUDA_H__

#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
#include <type_traits>

namespace op::equal::cuda {
typedef struct EqualOp {
public:
    static constexpr size_t num_inputs = 2;
    template <typename Tout, typename Tin0, typename Tin1>
    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
        if constexpr (std::is_same_v<Tin0, Tin1>) {
            if constexpr (std::is_same_v<Tin0, half2>) {
                static_assert(!std::is_same_v<Tin0, half2>,
                              "half2 is not supported for mixed output dtype");
            } else if constexpr (std::is_same_v<Tin0, half>) {
                return static_cast<Tout>(__heq(a, b));
            } else {
                return static_cast<Tout>(a == b);
            }
        } else {
            return false;
        }
    }
} EqualOp;
} // namespace op::equal::cuda

#endif
src/infiniop/ops/equal/metax/equal_metax.h
0 → 100644
#ifndef __EQUAL_METAX_API_H__
#define __EQUAL_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(equal, metax)
#endif // __EQUAL_METAX_API_H__
src/infiniop/ops/equal/metax/equal_metax.maca
0 → 100644
#include "equal_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::equal::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::equal::metax
src/infiniop/ops/equal/moore/equal_moore.h
0 → 100644
#ifndef __EQUAL_MOORE_API_H__
#define __EQUAL_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
ELEMENTWISE_DESCRIPTOR(equal, moore)
#endif // __EQUAL_MOORE_API_H__
src/infiniop/ops/equal/moore/equal_moore.mu
0 → 100644
#include "equal_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "equal_moore_kernel.h"
namespace op::equal::moore {
namespace {
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
if (!info.isOutputContiguous()) {
return false;
}
const bool *input_contiguous = info.getInputContiguous();
const bool *input_broadcasted = info.getInputBroadcasted();
for (size_t i = 0; i < 2; ++i) {
if (!input_contiguous[i] || input_broadcasted[i]) {
return false;
}
}
return true;
}
template <typename Tout, typename Tin>
INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) {
const auto op = op::equal::moore::EqualOp{};
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (; idx < numel; idx += stride) {
output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
}
}
template <typename Tout, typename Tin>
infiniStatus_t launch_fast_path(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int kBlockSize = 256;
int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
if (grid > 65535) {
grid = 65535;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
equal_contiguous_kernel<Tout, Tin><<<grid, kBlockSize, 0, musa_stream>>>(
numel,
reinterpret_cast<Tout *>(output),
reinterpret_cast<const Tin *>(inputs[0]),
reinterpret_cast<const Tin *>(inputs[1]));
return INFINI_STATUS_SUCCESS;
}
} // namespace
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create MOORE elementwise descriptor
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (can_use_contiguous_fast_path(_info)) {
size_t numel = _info.getOutputSize();
switch (_dtype) {
case INFINI_DTYPE_F16:
return launch_fast_path<bool, half>(numel, output, inputs, stream);
case INFINI_DTYPE_BF16:
return launch_fast_path<bool, cuda_bfloat16>(numel, output, inputs, stream);
case INFINI_DTYPE_F32:
return launch_fast_path<bool, float>(numel, output, inputs, stream);
case INFINI_DTYPE_I32:
return launch_fast_path<bool, int32_t>(numel, output, inputs, stream);
case INFINI_DTYPE_I64:
return launch_fast_path<bool, int64_t>(numel, output, inputs, stream);
case INFINI_DTYPE_F64:
return launch_fast_path<bool, double>(numel, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, moore::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, moore::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, moore::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<256, moore::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<256, moore::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, moore::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::equal::moore
src/infiniop/ops/equal/moore/equal_moore_kernel.h
0 → 100644
#ifndef __EQUAL_MOORE_KERNEL_H__
#define __EQUAL_MOORE_KERNEL_H__
#include <type_traits>
namespace op::equal::moore {
typedef struct EqualOp {
public:
    static constexpr size_t num_inputs = 2;
    template <typename Tout, typename Tin0, typename Tin1>
    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
        if constexpr (std::is_same_v<Tin0, Tin1>) {
            if constexpr (std::is_same_v<Tin0, half>) {
                return __half2float(a) == __half2float(b);
            } else if constexpr (std::is_same_v<Tin0, cuda_bfloat16>) {
                return __bfloat162float(a) == __bfloat162float(b);
            } else {
                return a == b;
            }
        } else {
            return false;
        }
    }
} EqualOp;
} // namespace op::equal::moore
#endif // __EQUAL_MOORE_KERNEL_H__
src/infiniop/ops/equal/nvidia/equal_nvidia.cu
0 → 100644
#include <algorithm>
#include <cstdint>
#include <type_traits>
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "equal_nvidia.cuh"
namespace {
template <typename Tout, typename Tin>
INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) {
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;
    op::equal::cuda::EqualOp op{};
    for (; idx < n; idx += stride) {
        output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
    }
}

template <typename Tout, typename Tin>
infiniStatus_t launchFastEqualKernel(size_t numel,
                                     void *output,
                                     const std::vector<const void *> &inputs,
                                     void *stream) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    constexpr int block = 256;
    int grid = static_cast<int>((numel + block - 1) / block);
    grid = std::min(grid, 65535);
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    FastEqualKernel<Tout, Tin><<<grid, block, 0, cuda_stream>>>(
        numel,
        reinterpret_cast<Tout *>(output),
        reinterpret_cast<const Tin *>(inputs[0]),
        reinterpret_cast<const Tin *>(inputs[1]));
    auto err = cudaGetLastError();
    return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace

namespace op::equal::nvidia {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();
    const auto &b_desc = input_desc_vec.at(1);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8);
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    bool fast_path = _info.isOutputContiguous();
    if (fast_path) {
        const bool *input_contiguous = _info.getInputContiguous();
        const bool *input_broadcasted = _info.getInputBroadcasted();
        for (size_t i = 0; i < 2; ++i) {
            fast_path &= input_contiguous[i] && !input_broadcasted[i];
        }
    }
    if (fast_path) {
        size_t numel = _info.getOutputSize();
        switch (_dtype) {
        case INFINI_DTYPE_F16:
            return launchFastEqualKernel<bool, half>(numel, output, inputs, stream);
        case INFINI_DTYPE_BF16:
            return launchFastEqualKernel<bool, cuda_bfloat16>(numel, output, inputs, stream);
        case INFINI_DTYPE_F32:
            return launchFastEqualKernel<bool, float>(numel, output, inputs, stream);
        case INFINI_DTYPE_I32:
            return launchFastEqualKernel<bool, int32_t>(numel, output, inputs, stream);
        case INFINI_DTYPE_I64:
            return launchFastEqualKernel<bool, int64_t>(numel, output, inputs, stream);
        case INFINI_DTYPE_F64:
            return launchFastEqualKernel<bool, double>(numel, output, inputs, stream);
        default:
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
    }
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::equal::nvidia
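The fast path above combines a grid-stride loop with a launch grid clamped to 65535 blocks, so the kernel still covers every element when the cap is reached: each thread simply loops over more than one index. A worked example of that launch arithmetic (illustrative values only, not part of this commit):

#include <algorithm>
#include <cstddef>

void example_fast_path_launch_config() {
    size_t numel = 50'000'000;   // hypothetical element count
    constexpr int block = 256;
    int grid = static_cast<int>((numel + block - 1) / block); // 195'313 blocks
    grid = std::min(grid, 65535);                              // clamped to 65'535
    // 65'535 blocks * 256 threads = 16'776'960 threads, so the grid-stride
    // loop has each thread handle roughly 3 of the 50'000'000 elements.
    (void)grid;
}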
src/infiniop/ops/equal/nvidia/equal_nvidia.cuh
0 → 100644
#ifndef __EQUAL_CUDA_API_H__
#define __EQUAL_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(equal, nvidia)
#endif
src/infiniop/ops/equal/operator.cc
0 → 100644
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/equal.h"
#ifdef ENABLE_CPU_API
#include "cpu/equal_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/equal_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/equal_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/equal_kunlun.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/equal_bang.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/equal_moore.h"
#endif

__INFINI_C infiniStatus_t infiniopCreateEqualDescriptor(
    infiniopHandle_t handle,
    infiniopEqualDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, NAMESPACE) \
    case CASE: \
        return op::equal::NAMESPACE::Descriptor::create( \
            handle, \
            reinterpret_cast<op::equal::NAMESPACE::Descriptor **>(desc_ptr), \
            c_desc, \
            {a_desc, b_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CREATE
}

__INFINI_C infiniStatus_t infiniopGetEqualWorkspaceSize(
    infiniopEqualDescriptor_t desc,
    size_t *size) {
#define GET(CASE, NAMESPACE) \
    case CASE: \
        *size = reinterpret_cast<op::equal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef GET
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}

__INFINI_C infiniStatus_t infiniopEqual(
    infiniopEqualDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    const void *b,
    void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
    case CASE: \
        return reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, c, {a, b}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CALCULATE
}

__INFINI_C infiniStatus_t infiniopDestroyEqualDescriptor(
    infiniopEqualDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
    case CASE: \
        delete reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef DELETE
}
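The equal entry points follow the same create / query / compute / destroy lifecycle sketched earlier for cross-entropy, with the two inputs packed as {a, b} into the backend's calculate(). A condensed, hypothetical sketch under the same assumptions (handle, descriptors, buffers, and workspace prepared elsewhere); note that c_desc must describe an INFINI_DTYPE_BOOL tensor with the same shape as a_desc and b_desc, or the backend create() will reject it:

// Hypothetical usage sketch, not part of this commit.
infiniStatus_t run_equal_once(infiniopHandle_t handle,
                              infiniopTensorDescriptor_t c_desc,
                              infiniopTensorDescriptor_t a_desc,
                              infiniopTensorDescriptor_t b_desc,
                              void *c, const void *a, const void *b,
                              void *workspace, size_t workspace_capacity,
                              void *stream) {
    infiniopEqualDescriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreateEqualDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }
    size_t workspace_size = 0;
    status = infiniopGetEqualWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS && workspace_size <= workspace_capacity) {
        status = infiniopEqual(desc, workspace, workspace_size, c, a, b, stream);
    }
    infiniopDestroyEqualDescriptor(desc);
    return status;
}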
src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
0 → 100644
#include "hardswish_cpu.h"
#include <cstddef>

namespace op::hardswish::cpu {

namespace {
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    return info.isOutputContiguous()
        && info.getInputSize() == 1
        && info.getInputContiguous()[0]
        && !info.getInputBroadcasted()[0];
}

template <typename T>
infiniStatus_t launch_contiguous_cpu(
    const op::elementwise::ElementwiseInfo &info,
    void *output,
    const std::vector<const void *> &inputs) {
    const T *in = reinterpret_cast<const T *>(inputs[0]);
    T *out = reinterpret_cast<T *>(output);
    const ptrdiff_t size = static_cast<ptrdiff_t>(info.getOutputSize());
#pragma omp parallel for if (size > 1024)
    for (ptrdiff_t i = 0; i < size; ++i) {
        out[i] = HardSwishOp{}(in[i]);
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    const bool fast_path = can_use_contiguous_fast_path(_info);
    if (fast_path) {
        switch (_dtype) {
        case INFINI_DTYPE_BF16:
            return launch_contiguous_cpu<bf16_t>(_info, output, inputs);
        case INFINI_DTYPE_F16:
            return launch_contiguous_cpu<fp16_t>(_info, output, inputs);
        case INFINI_DTYPE_F32:
            return launch_contiguous_cpu<float>(_info, output, inputs);
        case INFINI_DTYPE_F64:
            return launch_contiguous_cpu<double>(_info, output, inputs);
        default:
            break;
        }
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<HardSwishOp, bf16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<HardSwishOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<HardSwishOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<HardSwishOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::hardswish::cpu
src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
0 → 100644
#ifndef __HARDSWISH_CPU_H__
#define __HARDSWISH_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"

ELEMENTWISE_DESCRIPTOR(hardswish, cpu)

#include <algorithm>
#include <cmath>

namespace op::hardswish::cpu {
typedef struct HardSwishOp {
public:
    static constexpr size_t num_inputs = 1;
    template <typename T>
    T operator()(const T &x) const {
        const float x_f = utils::cast<float>(x);
        const float clamped = std::min(std::max(x_f + 3.0f, 0.0f), 6.0f);
        const float result = x_f * clamped * (1.0f / 6.0f);
        return utils::cast<T>(result);
    }
} HardSwishOp;

typedef struct HardSwishContiguousOp {
public:
    static constexpr size_t num_inputs = 1;
    template <typename T>
    T operator()(const T &x) const {
        T three = static_cast<T>(3);
        T zero = static_cast<T>(0);
        T six = static_cast<T>(6);
        T scale = static_cast<T>(0.16666667f);
        T val = x + three;
        val = std::max(zero, val);
        val = std::min(six, val);
        return x * val * scale;
    }
} HardSwishContiguousOp;
} // namespace op::hardswish::cpu

#endif
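HardSwishOp computes hardswish(x) = x * min(max(x + 3, 0), 6) / 6 in float and casts back to the element type. A few hand-checked values for reference (illustrative fragment, not from the diff; assumes the header above is included):

// hardswish(-4.0) = -4 * 0 / 6 =  0.0     (fully gated off for x <= -3)
// hardswish( 1.0) =  1 * 4 / 6 ≈  0.6667
// hardswish( 4.0) =  4 * 6 / 6 =  4.0     (identity for x >= 3)
float y = op::hardswish::cpu::HardSwishOp{}(1.0f); // ≈ 0.6667f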
src/infiniop/ops/hardswish/cuda/kernel.cuh
0 → 100644
#ifndef __HARDSWISH_CUDA_H__
#define __HARDSWISH_CUDA_H__

#include <cmath>
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif

namespace op::hardswish::cuda {
typedef struct HardSwishOp {
public:
    static constexpr size_t num_inputs = 1;
    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            const half2 three = __float2half2_rn(3.0f);
            const half2 scale = __float2half2_rn(0.16666667f);
            half2 val = __hadd2(x, three);
#if defined(ENABLE_ILUVATAR_API)
            float2 val_f = __half22float2(val);
            val_f.x = fminf(fmaxf(val_f.x, 0.0f), 6.0f);
            val_f.y = fminf(fmaxf(val_f.y, 0.0f), 6.0f);
            val = __floats2half2_rn(val_f.x, val_f.y);
#else
            const half2 zero = __float2half2_rn(0.0f);
            const half2 six = __float2half2_rn(6.0f);
#if __CUDA_ARCH__ >= 800
            val = __hmin2(__hmax2(val, zero), six);
#else
            val = __hmax2(val, zero);
            val = __hmin2(val, six);
#endif
#endif
            return __hmul2(__hmul2(x, val), scale);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            const float x_f = __bfloat162float(x);
            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
            return __float2bfloat16(x_f * val * 0.16666667f);
        } else if constexpr (std::is_same_v<T, half>) {
            const float x_f = __half2float(x);
            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
            return __float2half(x_f * val * 0.16666667f);
        } else if constexpr (std::is_same_v<T, float>) {
            const float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
            return x * val * 0.16666667f;
        } else if constexpr (std::is_same_v<T, double>) {
            const double val = fmin(fmax(x + 3.0, 0.0), 6.0);
            return x * val * (1.0 / 6.0);
        }
    }
} HardSwishOp;
} // namespace op::hardswish::cuda

#endif
src/infiniop/ops/hardswish/metax/hardswish_metax.h
0 → 100644
#ifndef __HARDSWISH_METAX_API_H__
#define __HARDSWISH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(hardswish, metax)
#endif // __HARDSWISH_METAX_API_H__
src/infiniop/ops/hardswish/metax/hardswish_metax.maca
0 → 100644
#include "hardswish_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::hardswish::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::hardswish::metax
src/infiniop/ops/hardswish/moore/hardswish_moore.h
0 → 100644
#ifndef __HARDSWISH_MOORE_API_H__
#define __HARDSWISH_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
ELEMENTWISE_DESCRIPTOR(hardswish, moore)
#endif // __HARDSWISH_MOORE_API_H__
src/infiniop/ops/hardswish/moore/hardswish_moore.mu
0 → 100644
#include "hardswish_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "hardswish_moore_kernel.h"
namespace op::hardswish::moore {
namespace {
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
return info.isOutputContiguous() && info.getInputSize() == 1 &&
info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}
template <typename T>
INFINIOP_MOORE_KERNEL hardswish_contiguous_kernel(size_t numel, T *out, const T *in) {
const auto op = op::hardswish::moore::HardSwishOp{};
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (; idx < numel; idx += stride) {
out[idx] = op(in[idx]);
}
}
template <typename T>
infiniStatus_t launch_fast_path(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int kBlockSize = 256;
int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
if (grid > 65535) {
grid = 65535;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
hardswish_contiguous_kernel<T><<<grid, kBlockSize, 0, musa_stream>>>(
numel,
reinterpret_cast<T *>(output),
reinterpret_cast<const T *>(inputs[0]));
return INFINI_STATUS_SUCCESS;
}
} // namespace
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
// create MOORE elementwise descriptor
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
const bool fast_path = can_use_contiguous_fast_path(_info);
if (fast_path) {
switch (_dtype) {
case INFINI_DTYPE_BF16:
return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
case INFINI_DTYPE_F16:
return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
case INFINI_DTYPE_F32:
return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
case INFINI_DTYPE_F64:
return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
default:
break;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, moore::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, moore::HardSwishOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, moore::HardSwishOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, moore::HardSwishOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::hardswish::moore