Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
93191613
Unverified
Commit
93191613
authored
Mar 13, 2026
by
thatPepe
Committed by
GitHub
Mar 13, 2026
Browse files
Merge pull request #1075 from InfiniTensor/RevertT_1-1-4
Revert T1-1-4
parents
6ab911c3
def22a08
Changes
203
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
1445 deletions
+0
-1445
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
+0
-129
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu
...infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu
+0
-107
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh
...nfiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh
+0
-8
src/infiniop/ops/cross_entropy/operator.cc
src/infiniop/ops/cross_entropy/operator.cc
+0
-174
src/infiniop/ops/equal/cpu/equal_cpu.cc
src/infiniop/ops/equal/cpu/equal_cpu.cc
+0
-68
src/infiniop/ops/equal/cpu/equal_cpu.h
src/infiniop/ops/equal/cpu/equal_cpu.h
+0
-28
src/infiniop/ops/equal/cuda/kernel.cuh
src/infiniop/ops/equal/cuda/kernel.cuh
+0
-37
src/infiniop/ops/equal/metax/equal_metax.h
src/infiniop/ops/equal/metax/equal_metax.h
+0
-8
src/infiniop/ops/equal/metax/equal_metax.maca
src/infiniop/ops/equal/metax/equal_metax.maca
+0
-69
src/infiniop/ops/equal/moore/equal_moore.h
src/infiniop/ops/equal/moore/equal_moore.h
+0
-8
src/infiniop/ops/equal/moore/equal_moore.mu
src/infiniop/ops/equal/moore/equal_moore.mu
+0
-140
src/infiniop/ops/equal/moore/equal_moore_kernel.h
src/infiniop/ops/equal/moore/equal_moore_kernel.h
+0
-30
src/infiniop/ops/equal/nvidia/equal_nvidia.cu
src/infiniop/ops/equal/nvidia/equal_nvidia.cu
+0
-137
src/infiniop/ops/equal/nvidia/equal_nvidia.cuh
src/infiniop/ops/equal/nvidia/equal_nvidia.cuh
+0
-8
src/infiniop/ops/equal/operator.cc
src/infiniop/ops/equal/operator.cc
+0
-201
src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
+0
-91
src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
+0
-50
src/infiniop/ops/hardswish/cuda/kernel.cuh
src/infiniop/ops/hardswish/cuda/kernel.cuh
+0
-86
src/infiniop/ops/hardswish/metax/hardswish_metax.h
src/infiniop/ops/hardswish/metax/hardswish_metax.h
+0
-8
src/infiniop/ops/hardswish/metax/hardswish_metax.maca
src/infiniop/ops/hardswish/metax/hardswish_metax.maca
+0
-58
No files found.
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu
deleted
100644 → 0
View file @
6ab911c3
#include "../../../devices/moore/moore_common.h"
#include "cross_entropy_moore.h"
#include <cub/block/block_reduce.cuh>
#include "../../../devices/moore/moore_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
#include "cross_entropy_kernel.h"
// Device-side entry point for the Moore (MUSA) backend: per the launch in
// launchKernel below, one thread block handles one row of logits. The actual
// reduction/loss math lives in the shared crossEntropyKernel
// (cross_entropy_kernel.h); this wrapper only instantiates it.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
INFINIOP_MOORE_KERNEL crossEntropy(
    Tdata *y, const Tdata *x, const void *target,
    size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
    crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
        y, x, target, outer_size, vocab_size, x_stride);
}
namespace op::cross_entropy::moore {
// Backend-private state: keeps the MUSA handle internals (device properties
// such as maxThreadsPerBlock, used in calculate()) alive for the descriptor's
// lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};
// Releases the backend-private state allocated in create().
Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates the tensor descriptors and builds a Moore-backend descriptor for
// the cross-entropy op.
//
// @param handle       backend handle (must be a device::moore::Handle)
// @param desc_ptr     receives the newly allocated Descriptor
// @param y_desc       output (loss) tensor descriptor
// @param x_desc       logits; last dimension is the vocabulary/class axis
// @param target_desc  class-index tensor (I32 or I64), one index per row
// @return INFINI_STATUS_SUCCESS or a dtype-validation error
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();
    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
    // Fix: y_desc was previously ignored entirely ((void)y_desc). The kernels
    // write Tdata values into y, so the output must share the logits dtype.
    if (y_desc->dtype() != x_dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.vocab_size = x_desc->shape().back();
    info.outer_size = target_desc->numel();
    // Logits are treated as contiguous along the class dimension.
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
        info, 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Dispatches the crossEntropy kernel over the (target dtype, logits dtype)
// pair recorded in `info`. One thread block is launched per output row.
// Unsupported combinations yield INFINI_STATUS_BAD_TENSOR_DTYPE.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
                            const CrossEntropyInfo &info, musaStream_t stream) {
    dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);

#define LAUNCH_CE_(TDATA, TIDX)                               \
    crossEntropy<BLOCK_SIZE, TDATA, TIDX, float>              \
        <<<grid, BLOCK_SIZE, 0, stream>>>(                    \
            (TDATA *)y, (const TDATA *)x, target,             \
            info.outer_size, info.vocab_size, info.x_stride); \
    return INFINI_STATUS_SUCCESS

#define DISPATCH_CE_(TIDX)                      \
    switch (info.dtype) {                       \
    case INFINI_DTYPE_F16:                      \
        LAUNCH_CE_(half, TIDX);                 \
    case INFINI_DTYPE_BF16:                     \
        LAUNCH_CE_(__mt_bfloat16, TIDX);        \
    case INFINI_DTYPE_F32:                      \
        LAUNCH_CE_(float, TIDX);                \
    default:                                    \
        return INFINI_STATUS_BAD_TENSOR_DTYPE;  \
    }

    switch (info.target_dtype) {
    case INFINI_DTYPE_I64:
        DISPATCH_CE_(int64_t)
    case INFINI_DTYPE_I32:
        DISPATCH_CE_(int32_t)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

#undef DISPATCH_CE_
#undef LAUNCH_CE_
}
// Runs cross-entropy on the given stream, picking the block size supported by
// the current device. No scratch memory is used (workspace size is 0).
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *y,
                                     const void *x,
                                     const void *target,
                                     void *stream_) const {
    (void)workspace;
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    auto stream = (musaStream_t)stream_;
    switch (_opaque->internal->maxThreadsPerBlock()) {
    case MOORE_BLOCK_SIZE_1024:
        return launchKernel<MOORE_BLOCK_SIZE_1024>(y, x, target, _info, stream);
    case MOORE_BLOCK_SIZE_512:
        return launchKernel<MOORE_BLOCK_SIZE_512>(y, x, target, _info, stream);
    default:
        // Devices with other block-size limits are not handled by this backend.
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
}
} // namespace op::cross_entropy::moore
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu
deleted
100644 → 0
View file @
6ab911c3
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "cross_entropy_nvidia.cuh"
// Device-side entry point (NVIDIA backend): thin wrapper that instantiates
// the shared crossEntropyKernel implementation (../cuda/kernel.cuh).
// Tcompute defaults to float for the accumulation type.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx,
          typename Tcompute = float>
INFINIOP_CUDA_KERNEL crossEntropy(Tdata *y,
                                  const Tdata *x,
                                  const void *target,
                                  size_t outer_size,
                                  size_t vocab_size,
                                  ptrdiff_t x_stride) {
    crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
        y, x, target, outer_size, vocab_size, x_stride);
}
namespace
op
::
cross_entropy
::
nvidia
{
// Backend-private state: keeps the NVIDIA handle internals (device limits,
// queried in calculate()) alive for the descriptor's lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
// Releases the backend-private state allocated in create().
Descriptor::~Descriptor() {
    delete _opaque;
}
// Builds an NVIDIA-backend descriptor for the cross-entropy op.
//
// @param handle       backend handle (must be a device::nvidia::Handle)
// @param desc_ptr     receives the newly allocated Descriptor
// @param y_desc       output (loss) tensor descriptor (currently unused)
// @param x_desc       logits; last dimension is the vocabulary/class axis
// @param target_desc  class-index tensor, one index per row
// @return INFINI_STATUS_SUCCESS or a dtype-validation error
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    (void)y_desc;
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();
    // Fix: this backend accepted any dtype and only failed (or, worse, ran
    // nothing) later in launchKernel. Validate up front like the Moore
    // backend does.
    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
    // Fix: value-initialize for parity with the Moore backend (avoids
    // indeterminate members if CrossEntropyInfo is a plain aggregate).
    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.vocab_size = x_desc->shape().back();
    info.outer_size = target_desc->numel();
    // Logits are treated as contiguous along the class dimension.
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
    auto internal = reinterpret_cast<device::nvidia::Handle *>(handle)->internal();
    *desc_ptr = new Descriptor(
        new Opaque{internal},
        info, 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Dispatches the crossEntropy kernel over the (target dtype, logits dtype)
// pair recorded in `info`. One thread block is launched per output row.
//
// Fix: the original inner dtype if/else chains had no trailing `else`, so an
// unsupported logits dtype fell through and the function returned
// INFINI_STATUS_SUCCESS without launching anything. It now reports
// INFINI_STATUS_BAD_TENSOR_DTYPE, matching the Moore backend.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
                            const CrossEntropyInfo &info, cudaStream_t stream) {
    dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);

#define LAUNCH_CE_(TDATA, TIDX)                               \
    crossEntropy<BLOCK_SIZE, TDATA, TIDX>                     \
        <<<grid, BLOCK_SIZE, 0, stream>>>(                    \
            (TDATA *)y, (const TDATA *)x, target,             \
            info.outer_size, info.vocab_size, info.x_stride); \
    return INFINI_STATUS_SUCCESS

#define DISPATCH_CE_(TIDX)                      \
    switch (info.dtype) {                       \
    case INFINI_DTYPE_F16:                      \
        LAUNCH_CE_(half, TIDX);                 \
    case INFINI_DTYPE_BF16:                     \
        LAUNCH_CE_(__nv_bfloat16, TIDX);        \
    case INFINI_DTYPE_F32:                      \
        LAUNCH_CE_(float, TIDX);                \
    default:                                    \
        return INFINI_STATUS_BAD_TENSOR_DTYPE;  \
    }

    switch (info.target_dtype) {
    case INFINI_DTYPE_I64:
        DISPATCH_CE_(int64_t)
    case INFINI_DTYPE_I32:
        DISPATCH_CE_(int32_t)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

#undef DISPATCH_CE_
#undef LAUNCH_CE_
}
// Runs cross-entropy on the given stream, choosing the largest supported
// block size (1024/512, falling back to 256). No scratch memory is used.
//
// Fix: workspace and workspace_size were silently ignored; validate the size
// like the Moore backend so callers get a consistent contract.
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *y,
                                     const void *x,
                                     const void *target,
                                     void *stream_) const {
    (void)workspace; // no scratch memory is needed by these kernels
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    cudaStream_t stream = (cudaStream_t)stream_;
    int max_threads = _opaque->internal->maxThreadsPerBlock();
    if (max_threads >= 1024) {
        CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream));
    } else if (max_threads >= 512) {
        CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream));
    } else {
        CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
    }
    return INFINI_STATUS_SUCCESS;
}
}
// namespace op::cross_entropy::nvidia
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __CROSS_ENTROPY_NVIDIA_H__
#define __CROSS_ENTROPY_NVIDIA_H__

#include "../cross_entropy.h"

// Declares op::cross_entropy::nvidia::Descriptor via the shared macro.
DESCRIPTOR(nvidia)

#endif
src/infiniop/ops/cross_entropy/operator.cc
deleted
100644 → 0
View file @
6ab911c3
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/cross_entropy.h"
#ifdef ENABLE_CPU_API
#include "cpu/cross_entropy_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/cross_entropy_nvidia.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/cross_entropy_moore.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/cross_entropy_metax.h"
#endif
// C-ABI entry point: creates a cross-entropy descriptor for whichever device
// backend the handle targets. Backends compiled out via ENABLE_*_API fall
// through to DEVICE_TYPE_NOT_SUPPORTED.
__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor(
    infiniopHandle_t handle,
    infiniopCrossEntropyDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {

#define CREATE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                       \
        return op::cross_entropy::NAMESPACE::Descriptor::create(                     \
            handle,                                                                  \
            reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc, x_desc, target_desc);

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// C-ABI entry point: reports the scratch-buffer size the descriptor's backend
// requires for infiniopCrossEntropy.
__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize(
    infiniopCrossEntropyDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                         \
    case CASE:                                                                                       \
        *size = reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// C-ABI entry point: executes cross-entropy with a previously created
// descriptor, forwarding workspace, tensors and stream to the backend.
__INFINI_C infiniStatus_t infiniopCrossEntropy(
    infiniopCrossEntropyDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *target,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                               \
    case CASE:                                                                   \
        return reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, x, target, stream);

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// C-ABI entry point: destroys a cross-entropy descriptor, dispatching to the
// backend that created it so the right Descriptor type is deleted.
__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor(
    infiniopCrossEntropyDescriptor_t desc) {

#define DESTROY(CASE, NAMESPACE)                                                 \
    case CASE:                                                                   \
        delete reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
        DESTROY(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
        DESTROY(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DESTROY
}
src/infiniop/ops/equal/cpu/equal_cpu.cc
deleted
100644 → 0
View file @
6ab911c3
#include <cstdint>
#include <type_traits>
#include "equal_cpu.h"
namespace
op
::
equal
::
cpu
{
// The elementwise base class owns all state; nothing extra to free here.
Descriptor::~Descriptor() = default;
// Validates dtypes/shapes for elementwise equal on CPU and builds the
// descriptor. Inputs must share one dtype; the output is always BOOL and all
// three tensors must have identical shapes.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();

    // Mixed-dtype comparison is not supported on this backend.
    if (compute_dtype != b_desc->dtype()) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Runs elementwise equal on CPU, dispatching on the input dtype captured at
// create() time. Output elements are bool.
//
// Fixes: marked the unused workspace parameters as such (the CPU elementwise
// path needs no scratch buffer) and removed an unreachable
// `return INFINI_STATUS_SUCCESS;` that followed the switch — every case and
// the default already return.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    (void)workspace;
    (void)workspace_size;

// One line per supported dtype: delegate to the templated elementwise engine.
#define EQ_CASE(DT, T) \
    case DT:           \
        return _device_info->calculate<EqualOp, bool, T, T>(_info, output, inputs, stream);

    switch (_dtype) {
        EQ_CASE(INFINI_DTYPE_F16, fp16_t)
        EQ_CASE(INFINI_DTYPE_F32, float)
        EQ_CASE(INFINI_DTYPE_F64, double)
        EQ_CASE(INFINI_DTYPE_BF16, bf16_t)
        EQ_CASE(INFINI_DTYPE_I32, int32_t)
        EQ_CASE(INFINI_DTYPE_I64, int64_t)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef EQ_CASE
}
}
// namespace op::equal::cpu
src/infiniop/ops/equal/cpu/equal_cpu.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_CPU_H__
#define __EQUAL_CPU_H__
#include <type_traits>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR
(
equal
,
cpu
)
namespace
op
::
equal
::
cpu
{
// Binary predicate for elementwise equality on the CPU backend.
// Returns a == b when both inputs share one type; inputs of different types
// are (deliberately) never equal.
struct EqualOp {
public:
    // Number of input tensors consumed per output element.
    static constexpr size_t num_inputs = 2;

    // Tout is accepted for interface parity with the other backends; the
    // result is always a bool here.
    // Fix: marked const so the functor can be invoked through const objects,
    // consistent with the CUDA backend's EqualOp.
    template <typename Tout, typename Tin0, typename Tin1>
    bool operator()(const Tin0 &a, const Tin1 &b) const {
        if constexpr (std::is_same_v<Tin0, Tin1>) {
            return a == b;
        } else {
            return false;
        }
    }
};
}
// namespace op::equal::cpu
#endif
src/infiniop/ops/equal/cuda/kernel.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_CUDA_H__
#define __EQUAL_CUDA_H__
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
#include <type_traits>
namespace
op
::
equal
::
cuda
{
// Elementwise equality functor shared by the CUDA-like backends.
// Inputs of different types never compare equal; `half` uses the intrinsic
// __heq; half2 is explicitly rejected at compile time.
struct EqualOp {
public:
    static constexpr size_t num_inputs = 2;

    template <typename Tout, typename Tin0, typename Tin1>
    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
        if constexpr (!std::is_same_v<Tin0, Tin1>) {
            return false;
        } else if constexpr (std::is_same_v<Tin0, half2>) {
            // A packed half2 cannot produce a single scalar result.
            static_assert(!std::is_same_v<Tin0, half2>,
                          "half2 is not supported for mixed output dtype");
        } else if constexpr (std::is_same_v<Tin0, half>) {
            return static_cast<Tout>(__heq(a, b));
        } else {
            return static_cast<Tout>(a == b);
        }
    }
};
}
// namespace op::equal::cuda
#endif
src/infiniop/ops/equal/metax/equal_metax.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_METAX_API_H__
#define __EQUAL_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares op::equal::metax::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(equal, metax)

#endif // __EQUAL_METAX_API_H__
src/infiniop/ops/equal/metax/equal_metax.maca
deleted
100644 → 0
View file @
6ab911c3
#include "equal_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::equal::metax {
// The elementwise base class owns all state; nothing extra to free here.
Descriptor::~Descriptor() = default;
// Validates dtypes/shapes for elementwise equal on METAX and builds the
// descriptor. The output must be BOOL; all three shapes must match.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();

    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Runs elementwise equal on METAX via the shared elementwise engine,
// dispatching on the input dtype captured at create() time.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

#define EQ_CASE(DT, T) \
    case DT:           \
        return _device_info->calculate<256, cuda::EqualOp, bool, T, T>(_info, workspace, output, inputs, stream);

    switch (_dtype) {
        EQ_CASE(INFINI_DTYPE_F16, half)
        EQ_CASE(INFINI_DTYPE_BF16, cuda_bfloat16)
        EQ_CASE(INFINI_DTYPE_F32, float)
        EQ_CASE(INFINI_DTYPE_I32, int32_t)
        EQ_CASE(INFINI_DTYPE_I64, int64_t)
        EQ_CASE(INFINI_DTYPE_F64, double)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef EQ_CASE
}
} // namespace op::equal::metax
src/infiniop/ops/equal/moore/equal_moore.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_MOORE_API_H__
#define __EQUAL_MOORE_API_H__

#include "../../../elementwise/moore/elementwise_moore_api.h"

// Declares op::equal::moore::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(equal, moore)

#endif // __EQUAL_MOORE_API_H__
src/infiniop/ops/equal/moore/equal_moore.mu
deleted
100644 → 0
View file @
6ab911c3
#include "equal_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "equal_moore_kernel.h"
namespace op::equal::moore {
namespace {
// True when the op can skip the generic strided path: the output and both
// inputs are contiguous and neither input is broadcast.
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    if (!info.isOutputContiguous()) {
        return false;
    }
    const bool *contig = info.getInputContiguous();
    const bool *bcast = info.getInputBroadcasted();
    return contig[0] && !bcast[0] && contig[1] && !bcast[1];
}
// Grid-stride kernel for the contiguous fast path: output[i] = (a[i] == b[i]).
template <typename Tout, typename Tin>
INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) {
    const auto op = op::equal::moore::EqualOp{};
    // Each thread advances by the total launched thread count, so any grid
    // size (launch_fast_path caps it at 65535 blocks) covers all elements.
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    size_t stride = blockDim.x * gridDim.x;
    for (; idx < numel; idx += stride) {
        output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
    }
}
// Launches the contiguous fast-path kernel. The grid is capped at 65535
// blocks; the kernel's grid-stride loop covers the remainder.
//
// Fix: launch failures (bad configuration, invalid stream) were silently
// swallowed — the function unconditionally reported success. Check the
// runtime's pending-error state after the launch, mirroring the NVIDIA
// fast path in equal_nvidia.cu. (MUSA mirrors the CUDA runtime API;
// verify musaGetLastError/musaSuccess against the MUSA headers.)
template <typename Tout, typename Tin>
infiniStatus_t launch_fast_path(size_t numel,
                                void *output,
                                const std::vector<const void *> &inputs,
                                void *stream) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS; // nothing to compare
    }
    constexpr int kBlockSize = 256;
    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
    if (grid > 65535) {
        grid = 65535;
    }
    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
    equal_contiguous_kernel<Tout, Tin><<<grid, kBlockSize, 0, musa_stream>>>(
        numel,
        reinterpret_cast<Tout *>(output),
        reinterpret_cast<const Tin *>(inputs[0]),
        reinterpret_cast<const Tin *>(inputs[1]));
    return musaGetLastError() == musaSuccess ? INFINI_STATUS_SUCCESS
                                             : INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace
// The elementwise base class owns all state; nothing extra to free here.
Descriptor::~Descriptor() = default;
// Validates dtypes/shapes for elementwise equal on Moore and builds the
// descriptor. The output must be BOOL; all three shapes must match.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();

    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Runs elementwise equal on Moore. Fully contiguous, non-broadcast tensors
// take a dedicated kernel that needs no workspace; everything else goes
// through the generic elementwise engine.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (can_use_contiguous_fast_path(_info)) {
        size_t numel = _info.getOutputSize();
#define EQ_FAST(DT, T) \
    case DT:           \
        return launch_fast_path<bool, T>(numel, output, inputs, stream);
        switch (_dtype) {
            EQ_FAST(INFINI_DTYPE_F16, half)
            EQ_FAST(INFINI_DTYPE_BF16, cuda_bfloat16)
            EQ_FAST(INFINI_DTYPE_F32, float)
            EQ_FAST(INFINI_DTYPE_I32, int32_t)
            EQ_FAST(INFINI_DTYPE_I64, int64_t)
            EQ_FAST(INFINI_DTYPE_F64, double)
        default:
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
#undef EQ_FAST
    }

    // Workspace is only needed on the generic path, so it is checked here,
    // after the fast path had its chance.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
#define EQ_GENERIC(DT, T) \
    case DT:              \
        return _device_info->calculate<256, moore::EqualOp, bool, T, T>(_info, workspace, output, inputs, stream);
    switch (_dtype) {
        EQ_GENERIC(INFINI_DTYPE_F16, half)
        EQ_GENERIC(INFINI_DTYPE_BF16, cuda_bfloat16)
        EQ_GENERIC(INFINI_DTYPE_F32, float)
        EQ_GENERIC(INFINI_DTYPE_I32, int32_t)
        EQ_GENERIC(INFINI_DTYPE_I64, int64_t)
        EQ_GENERIC(INFINI_DTYPE_F64, double)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef EQ_GENERIC
}
} // namespace op::equal::moore
src/infiniop/ops/equal/moore/equal_moore_kernel.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_MOORE_KERNEL_H__
#define __EQUAL_MOORE_KERNEL_H__
#include <type_traits>
namespace
op
::
equal
::
moore
{
// Elementwise equality functor for the Moore backend. Half-precision inputs
// are widened to float before comparing; inputs of different types never
// compare equal.
struct EqualOp {
public:
    static constexpr size_t num_inputs = 2;

    template <typename Tout, typename Tin0, typename Tin1>
    __device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
        if constexpr (!std::is_same_v<Tin0, Tin1>) {
            return false;
        } else if constexpr (std::is_same_v<Tin0, half>) {
            return __half2float(a) == __half2float(b);
        } else if constexpr (std::is_same_v<Tin0, cuda_bfloat16>) {
            return __bfloat162float(a) == __bfloat162float(b);
        } else {
            return a == b;
        }
    }
};
}
// namespace op::equal::moore
#endif // __EQUAL_MOORE_KERNEL_H__
src/infiniop/ops/equal/nvidia/equal_nvidia.cu
deleted
100644 → 0
View file @
6ab911c3
#include <algorithm>
#include <cstdint>
#include <type_traits>
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "equal_nvidia.cuh"
namespace
{
// Grid-stride kernel for the contiguous fast path: output[i] = (a[i] == b[i]).
template <typename Tout, typename Tin>
INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) {
    op::equal::cuda::EqualOp op{};
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t stride = blockDim.x * gridDim.x;
    for (; idx < n; idx += stride) {
        output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
    }
}
// Launches the contiguous fast-path kernel and surfaces launch-time errors
// (invalid configuration/stream) as INFINI_STATUS_INTERNAL_ERROR. The grid is
// capped at 65535 blocks; the kernel's grid-stride loop covers the rest.
template <typename Tout, typename Tin>
infiniStatus_t launchFastEqualKernel(size_t numel,
                                     void *output,
                                     const std::vector<const void *> &inputs,
                                     void *stream) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS; // nothing to compare
    }
    constexpr int block = 256;
    int grid = std::min(static_cast<int>((numel + block - 1) / block), 65535);
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    FastEqualKernel<Tout, Tin><<<grid, block, 0, cuda_stream>>>(
        numel,
        reinterpret_cast<Tout *>(output),
        reinterpret_cast<const Tin *>(inputs[0]),
        reinterpret_cast<const Tin *>(inputs[1]));
    return cudaGetLastError() == cudaSuccess ? INFINI_STATUS_SUCCESS
                                             : INFINI_STATUS_INTERNAL_ERROR;
}
}
// namespace
namespace
op
::
equal
::
nvidia
{
// The elementwise base class owns all state; nothing extra to free here.
Descriptor::~Descriptor() = default;
// Validates dtypes/shapes for elementwise equal on NVIDIA and builds the
// descriptor. Unlike the other backends, byte-sized integer outputs (U8/I8)
// are accepted in addition to BOOL; all three shapes must match.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    auto compute_dtype = a_desc->dtype();
    auto out_dtype = out_desc->dtype();

    CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
                INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8);

    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Runs elementwise equal on NVIDIA. Fully contiguous, non-broadcast tensors
// take the dedicated fast-path kernel (no workspace needed); everything else
// goes through the generic elementwise engine.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    bool fast_path = _info.isOutputContiguous();
    if (fast_path) {
        const bool *contig = _info.getInputContiguous();
        const bool *bcast = _info.getInputBroadcasted();
        for (size_t i = 0; i < 2; ++i) {
            fast_path &= contig[i] && !bcast[i];
        }
    }

    if (fast_path) {
        size_t numel = _info.getOutputSize();
#define EQ_FAST(DT, T) \
    case DT:           \
        return launchFastEqualKernel<bool, T>(numel, output, inputs, stream);
        switch (_dtype) {
            EQ_FAST(INFINI_DTYPE_F16, half)
            EQ_FAST(INFINI_DTYPE_BF16, cuda_bfloat16)
            EQ_FAST(INFINI_DTYPE_F32, float)
            EQ_FAST(INFINI_DTYPE_I32, int32_t)
            EQ_FAST(INFINI_DTYPE_I64, int64_t)
            EQ_FAST(INFINI_DTYPE_F64, double)
        default:
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
#undef EQ_FAST
    }

    // Workspace is only needed on the generic path.
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
#define EQ_GENERIC(DT, T) \
    case DT:              \
        return _device_info->calculate<256, cuda::EqualOp, bool, T, T>(_info, workspace, output, inputs, stream);
    switch (_dtype) {
        EQ_GENERIC(INFINI_DTYPE_F16, half)
        EQ_GENERIC(INFINI_DTYPE_BF16, cuda_bfloat16)
        EQ_GENERIC(INFINI_DTYPE_F32, float)
        EQ_GENERIC(INFINI_DTYPE_I32, int32_t)
        EQ_GENERIC(INFINI_DTYPE_I64, int64_t)
        EQ_GENERIC(INFINI_DTYPE_F64, double)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
#undef EQ_GENERIC
}
}
// namespace op::equal::nvidia
src/infiniop/ops/equal/nvidia/equal_nvidia.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __EQUAL_CUDA_API_H__
#define __EQUAL_CUDA_API_H__

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

// Declares op::equal::nvidia::Descriptor (create/calculate/workspaceSize)
// via the shared elementwise descriptor macro; the implementation lives in
// equal_nvidia.cu.
ELEMENTWISE_DESCRIPTOR(equal, nvidia)

#endif
src/infiniop/ops/equal/operator.cc
deleted
100644 → 0
View file @
6ab911c3
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/equal.h"
#ifdef ENABLE_CPU_API
#include "cpu/equal_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/equal_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/equal_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/equal_kunlun.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/equal_bang.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/equal_moore.h"
#endif
// Creates an Equal descriptor for the backend selected by `handle->device`.
// a_desc/b_desc describe the two inputs compared elementwise; c_desc describes
// the output. Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when the handle's
// device was not compiled in (ENABLE_*_API) or is unknown.
__INFINI_C infiniStatus_t infiniopCreateEqualDescriptor(
    infiniopHandle_t handle,
    infiniopEqualDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {

// Expands to one switch case per backend, forwarding to that backend's
// Descriptor::create. Each case is compiled in only under its ENABLE_*_API flag.
#define CREATE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        return op::equal::NAMESPACE::Descriptor::create(                     \
            handle,                                                          \
            reinterpret_cast<op::equal::NAMESPACE::Descriptor **>(desc_ptr), \
            c_desc,                                                          \
            {a_desc, b_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar and QY reuse the nvidia implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Queries the workspace size (in bytes) required by infiniopEqual for the
// given descriptor, dispatching on the device type baked into the descriptor.
// On success writes *size and returns INFINI_STATUS_SUCCESS; unknown or
// not-compiled-in device types yield INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED.
__INFINI_C infiniStatus_t infiniopGetEqualWorkspaceSize(
    infiniopEqualDescriptor_t desc,
    size_t *size) {

// One switch case per backend: fetch the backend descriptor's workspace size.
#define GET(CASE, NAMESPACE)                                                                 \
    case CASE:                                                                               \
        *size = reinterpret_cast<op::equal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar and QY reuse the nvidia implementation.
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
    // NOTE: the trailing duplicate `return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;`
    // was removed — every switch path (including `default`) already returns, so it
    // was unreachable, and the sibling dispatchers in this file have no such tail.
}
// Executes elementwise equality: compares tensors a and b and writes results
// to c, using the workspace sized by infiniopGetEqualWorkspaceSize. Dispatches
// to the backend implementation selected when the descriptor was created.
__INFINI_C infiniStatus_t infiniopEqual(
    infiniopEqualDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    const void *b,
    void *stream) {

// One switch case per backend, forwarding to that backend's
// Descriptor::calculate with inputs ordered {a, b}.
#define CALCULATE(CASE, NAMESPACE)                                     \
    case CASE:                                                         \
        return reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, c, {a, b}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar and QY reuse the nvidia implementation.
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// Destroys an Equal descriptor, dispatching the delete to the backend type it
// was created as so the correct destructor runs.
__INFINI_C infiniStatus_t infiniopDestroyEqualDescriptor(
    infiniopEqualDescriptor_t desc) {

// One switch case per backend: delete through the backend's Descriptor type.
#define DELETE(CASE, NAMESPACE)                                                 \
    case CASE:                                                                  \
        delete reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar and QY reuse the nvidia implementation.
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc
deleted
100644 → 0
View file @
6ab911c3
#include "hardswish_cpu.h"
#include <cstddef>
namespace op::hardswish::cpu {

namespace {

// True when the single input and the output are both contiguous and the input
// is not broadcast, so the op can run as a flat loop over raw pointers instead
// of going through the generic strided elementwise machinery.
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    return info.isOutputContiguous()
        && info.getInputSize() == 1
        && info.getInputContiguous()[0]
        && !info.getInputBroadcasted()[0];
}

// Contiguous fast path: applies HardSwishOp element-by-element over flat
// buffers, parallelized with OpenMP for large sizes.
template <typename T>
infiniStatus_t launch_contiguous_cpu(
    const op::elementwise::ElementwiseInfo &info,
    void *output,
    const std::vector<const void *> &inputs) {
    const T *in = reinterpret_cast<const T *>(inputs[0]);
    T *out = reinterpret_cast<T *>(output);
    // ptrdiff_t loop index: OpenMP requires a signed induction variable.
    const ptrdiff_t size = static_cast<ptrdiff_t>(info.getOutputSize());
#pragma omp parallel for if (size > 1024)
    for (ptrdiff_t i = 0; i < size; ++i) {
        out[i] = HardSwishOp{}(in[i]);
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace

Descriptor::~Descriptor() = default;

// Validates dtype (bf16/f16/f32/f64) and that input/output shapes match, then
// builds the generic CPU elementwise descriptor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}

// Runs hardswish on the CPU. Takes the contiguous fast path when possible,
// otherwise falls back to the generic strided elementwise engine.
// workspace/workspace_size/stream are unused on this backend.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    const bool fast_path = can_use_contiguous_fast_path(_info);
    if (fast_path) {
        switch (_dtype) {
        case INFINI_DTYPE_BF16:
            return launch_contiguous_cpu<bf16_t>(_info, output, inputs);
        case INFINI_DTYPE_F16:
            return launch_contiguous_cpu<fp16_t>(_info, output, inputs);
        case INFINI_DTYPE_F32:
            return launch_contiguous_cpu<float>(_info, output, inputs);
        case INFINI_DTYPE_F64:
            return launch_contiguous_cpu<double>(_info, output, inputs);
        default:
            // Unrecognized dtype on the fast path: fall through to the generic
            // dispatch below, which reports BAD_TENSOR_DTYPE.
            break;
        }
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<HardSwishOp, bf16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<HardSwishOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<HardSwishOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<HardSwishOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // NOTE: the original trailing `return INFINI_STATUS_SUCCESS;` was removed —
    // every switch path (including `default`) returns, so it was unreachable.
}

} // namespace op::hardswish::cpu
src/infiniop/ops/hardswish/cpu/hardswish_cpu.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDSWISH_CPU_H__
#define __HARDSWISH_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"

// Declares op::hardswish::cpu::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(hardswish, cpu)

#include <algorithm>
#include <cmath>

namespace op::hardswish::cpu {

// hardswish(x) = x * clamp(x + 3, 0, 6) / 6.
// Plain `struct` instead of the original C-style `typedef struct X {...} X;`
// — redundant in C++; the type name is unchanged for all users.
struct HardSwishOp {
public:
    static constexpr size_t num_inputs = 1;

    // Computes in float regardless of T, using utils::cast for the
    // low-precision conversions, then casts back to T.
    template <typename T>
    T operator()(const T &x) const {
        const float x_f = utils::cast<float>(x);
        const float clamped = std::min(std::max(x_f + 3.0f, 0.0f), 6.0f);
        const float result = x_f * clamped * (1.0f / 6.0f);
        return utils::cast<T>(result);
    }
};

// Variant that computes natively in T (no float round-trip); requires T to
// support +, *, and std::min/std::max.
// NOTE(review): not referenced anywhere in the code visible in this file —
// confirm external callers before relying on or removing it.
struct HardSwishContiguousOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    T operator()(const T &x) const {
        T three = static_cast<T>(3);
        T zero = static_cast<T>(0);
        T six = static_cast<T>(6);
        T scale = static_cast<T>(0.16666667f);
        T val = x + three;
        val = std::max(zero, val);
        val = std::min(six, val);
        return x * val * scale;
    }
};

} // namespace op::hardswish::cpu

#endif
src/infiniop/ops/hardswish/cuda/kernel.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDSWISH_CUDA_H__
#define __HARDSWISH_CUDA_H__

#include <cmath>

// MACA (Metax) toolchain ships its own fp16/bf16 headers; otherwise use CUDA's.
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif

namespace op::hardswish::cuda {

// Device-side functor: hardswish(x) = x * clamp(x + 3, 0, 6) / 6,
// specialized per element type via `if constexpr`.
typedef struct HardSwishOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Packed two-lane half path; 0.16666667f ~= 1/6.
            const half2 three = __float2half2_rn(3.0f);
            const half2 scale = __float2half2_rn(0.16666667f);
            half2 val = __hadd2(x, three);
#if defined(ENABLE_ILUVATAR_API)
            // Iluvatar: clamp per lane in float and repack.
            float2 val_f = __half22float2(val);
            val_f.x = fminf(fmaxf(val_f.x, 0.0f), 6.0f);
            val_f.y = fminf(fmaxf(val_f.y, 0.0f), 6.0f);
            val = __floats2half2_rn(val_f.x, val_f.y);
#else
            const half2 zero = __float2half2_rn(0.0f);
            const half2 six = __float2half2_rn(6.0f);
#if __CUDA_ARCH__ >= 800
            val = __hmin2(__hmax2(val, zero), six);
#else
            // NOTE(review): both arch branches use __hmax2/__hmin2, only split
            // into two statements here — presumably a codegen workaround for
            // pre-sm_80 targets; confirm against the toolchain docs.
            val = __hmax2(val, zero);
            val = __hmin2(val, six);
#endif
#endif
            return __hmul2(__hmul2(x, val), scale);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            // bf16: compute in float, convert back.
            const float x_f = __bfloat162float(x);
            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
            return __float2bfloat16(x_f * val * 0.16666667f);
        } else if constexpr (std::is_same_v<T, half>) {
            // Scalar half: compute in float, convert back.
            const float x_f = __half2float(x);
            const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
            return __float2half(x_f * val * 0.16666667f);
        } else if constexpr (std::is_same_v<T, float>) {
            const float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
            return x * val * 0.16666667f;
        } else if constexpr (std::is_same_v<T, double>) {
            const double val = fmin(fmax(x + 3.0, 0.0), 6.0);
            return x * val * (1.0 / 6.0);
        }
        // NOTE(review): no final `else` — instantiating with any other T falls
        // off the end without a return; relies on callers only using the types
        // above. Consider a static_assert in a future change.
    }
} HardSwishOp;

} // namespace op::hardswish::cuda

#endif
src/infiniop/ops/hardswish/metax/hardswish_metax.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDSWISH_METAX_API_H__
#define __HARDSWISH_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

// Declares op::hardswish::metax::Descriptor via the shared elementwise macro;
// the implementation lives in hardswish_metax.maca.
ELEMENTWISE_DESCRIPTOR(hardswish, metax)

#endif // __HARDSWISH_METAX_API_H__
src/infiniop/ops/hardswish/metax/hardswish_metax.maca
deleted
100644 → 0
View file @
6ab911c3
#include "hardswish_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::hardswish::metax {

Descriptor::~Descriptor() = default;

// Validates dtype (bf16/f16/f32/f64) and that input/output shapes match, then
// builds the generic Metax elementwise descriptor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}

// Dispatches the device-side cuda::HardSwishOp kernel (256-thread blocks)
// by dtype after checking the caller-provided workspace is large enough.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::hardswish::metax
Prev
1
2
3
4
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment