Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
93191613
Unverified
Commit
93191613
authored
Mar 13, 2026
by
thatPepe
Committed by
GitHub
Mar 13, 2026
Browse files
Merge pull request #1075 from InfiniTensor/RevertT_1-1-4
Revert T1-1-4
parents
6ab911c3
def22a08
Changes
203
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
1583 deletions
+0
-1583
src/infiniop/ops/hardswish/moore/hardswish_moore.h
src/infiniop/ops/hardswish/moore/hardswish_moore.h
+0
-8
src/infiniop/ops/hardswish/moore/hardswish_moore.mu
src/infiniop/ops/hardswish/moore/hardswish_moore.mu
+0
-118
src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h
src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h
+0
-39
src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu
src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu
+0
-115
src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh
src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh
+0
-8
src/infiniop/ops/hardswish/operator.cc
src/infiniop/ops/hardswish/operator.cc
+0
-157
src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc
src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc
+0
-124
src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h
src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h
+0
-63
src/infiniop/ops/hardtanh/cuda/kernel.cuh
src/infiniop/ops/hardtanh/cuda/kernel.cuh
+0
-51
src/infiniop/ops/hardtanh/metax/hardtanh_metax.h
src/infiniop/ops/hardtanh/metax/hardtanh_metax.h
+0
-48
src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca
src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca
+0
-95
src/infiniop/ops/hardtanh/moore/hardtanh_moore.h
src/infiniop/ops/hardtanh/moore/hardtanh_moore.h
+0
-51
src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu
src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu
+0
-158
src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h
src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h
+0
-34
src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu
src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu
+0
-150
src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh
src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh
+0
-51
src/infiniop/ops/hardtanh/operator.cc
src/infiniop/ops/hardtanh/operator.cc
+0
-161
src/infiniop/ops/sum/cpu/sum_cpu.cc
src/infiniop/ops/sum/cpu/sum_cpu.cc
+0
-70
src/infiniop/ops/sum/cpu/sum_cpu.h
src/infiniop/ops/sum/cpu/sum_cpu.h
+0
-8
src/infiniop/ops/sum/cuda/kernel.cuh
src/infiniop/ops/sum/cuda/kernel.cuh
+0
-74
No files found.
src/infiniop/ops/hardswish/moore/hardswish_moore.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDSWISH_MOORE_API_H__
#define __HARDSWISH_MOORE_API_H__

#include "../../../elementwise/moore/elementwise_moore_api.h"

/// Declares op::hardswish::moore::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(hardswish, moore)

#endif // __HARDSWISH_MOORE_API_H__
src/infiniop/ops/hardswish/moore/hardswish_moore.mu
deleted
100644 → 0
View file @
6ab911c3
#include "hardswish_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "hardswish_moore_kernel.h"
namespace op::hardswish::moore {
namespace {

/// True when the single input and the output are both dense (contiguous,
/// non-broadcast), so the generic strided machinery can be bypassed.
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    if (!info.isOutputContiguous() || info.getInputSize() != 1) {
        return false;
    }
    return info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}

/// Grid-stride kernel applying HardSwish over a flat, contiguous buffer.
template <typename T>
INFINIOP_MOORE_KERNEL hardswish_contiguous_kernel(size_t numel, T *out, const T *in) {
    const op::hardswish::moore::HardSwishOp functor{};
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t step = blockDim.x * gridDim.x;
    while (i < numel) {
        out[i] = functor(in[i]);
        i += step;
    }
}

/// Launches the contiguous kernel on the given MUSA stream.
/// A zero-sized tensor is a successful no-op.
template <typename T>
infiniStatus_t launch_fast_path(size_t numel,
                                void *output,
                                const std::vector<const void *> &inputs,
                                void *stream) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    constexpr int kBlockSize = 256;
    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
    if (grid > 65535) {
        grid = 65535; // the grid-stride loop covers the remainder
    }
    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
    hardswish_contiguous_kernel<T><<<grid, kBlockSize, 0, musa_stream>>>(
        numel,
        reinterpret_cast<T *>(output),
        reinterpret_cast<const T *>(inputs[0]));
    return INFINI_STATUS_SUCCESS;
}

} // namespace
Descriptor::~Descriptor() = default;

/// Validates the request and builds a HardSwish descriptor for MOORE devices.
/// Accepts BF16/F16/F32/F64 and requires input and output shapes to match
/// (HardSwish is unary and elementwise).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // Unary op: exactly one input descriptor is expected.
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
/// Applies HardSwish. Dense single-input tensors take the dedicated
/// contiguous kernel; all other layouts go through the generic elementwise
/// path, which requires the pre-computed workspace.
/// Fix: removed the unreachable trailing `return` — every switch case
/// (including default) already returns.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (can_use_contiguous_fast_path(_info)) {
        switch (_dtype) {
        case INFINI_DTYPE_BF16:
            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
        case INFINI_DTYPE_F16:
            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
        case INFINI_DTYPE_F32:
            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
        case INFINI_DTYPE_F64:
            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
        default:
            break; // unexpected dtype: fall through to the generic path
        }
    }
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, moore::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, moore::HardSwishOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, moore::HardSwishOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, moore::HardSwishOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::hardswish::moore
src/infiniop/ops/hardswish/moore/hardswish_moore_kernel.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDSWISH_MOORE_KERNEL_H__
#define __HARDSWISH_MOORE_KERNEL_H__

#include <cmath>
#include <type_traits>

namespace op::hardswish::moore {

/// Device functor computing HardSwish: y = x * clamp(x + 3, 0, 6) / 6.
/// Half-precision types are widened to float for the arithmetic, then
/// narrowed back. (Fix: replaced the C-style `typedef struct X {...} X;`
/// with a plain C++ struct and dropped the redundant `public:`.)
struct HardSwishOp {
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, half>) {
            float x_f = __half2float(x);
            float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
            return __float2half(x_f * val * 0.16666667f); // 1/6
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            float x_f = __bfloat162float(x);
            float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
            return __float2bfloat16_rn(x_f * val * 0.16666667f);
        } else if constexpr (std::is_same_v<T, float>) {
            float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
            return x * val * 0.16666667f;
        } else if constexpr (std::is_same_v<T, double>) {
            double val = fmin(fmax(x + 3.0, 0.0), 6.0);
            return x * val * (1.0 / 6.0);
        } else {
            // Fallback for any other type: compute in float, narrow back.
            float x_f = static_cast<float>(x);
            float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
            return static_cast<T>(x_f * val * 0.16666667f);
        }
    }
};

} // namespace op::hardswish::moore

#endif // __HARDSWISH_MOORE_KERNEL_H__
src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu
deleted
100644 → 0
View file @
6ab911c3
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "hardswish_nvidia.cuh"
#include <cuda_runtime.h>
namespace op::hardswish::nvidia {
namespace {

/// True when both the single input and the output are dense (contiguous,
/// non-broadcast), allowing the strided elementwise machinery to be skipped.
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    if (!info.isOutputContiguous() || info.getInputSize() != 1) {
        return false;
    }
    return info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}

/// Grid-stride kernel applying HardSwish over a flat, contiguous buffer.
template <typename T>
__global__ void hardswish_contiguous_kernel(size_t numel, T *out, const T *in) {
    const op::hardswish::cuda::HardSwishOp functor{};
    const size_t step = blockDim.x * gridDim.x;
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += step) {
        out[i] = functor(in[i]);
    }
}

/// Launches the contiguous kernel; reports launch failures as internal errors.
/// A zero-sized tensor is a successful no-op.
template <typename T>
infiniStatus_t launch_fast_path(size_t numel,
                                void *output,
                                const std::vector<const void *> &inputs,
                                void *stream) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    constexpr int BLOCK_SIZE = 256;
    int grid = static_cast<int>((numel + BLOCK_SIZE - 1) / BLOCK_SIZE);
    grid = std::min(grid, 65535); // grid-stride loop covers the remainder
    auto *dst = reinterpret_cast<T *>(output);
    const auto *src = reinterpret_cast<const T *>(inputs[0]);
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    hardswish_contiguous_kernel<<<grid, BLOCK_SIZE, 0, cuda_stream>>>(numel, dst, src);
    return cudaGetLastError() == cudaSuccess ? INFINI_STATUS_SUCCESS
                                             : INFINI_STATUS_INTERNAL_ERROR;
}

} // namespace
Descriptor::~Descriptor() = default;

/// Validates dtype and shapes, then builds the HardSwish descriptor for
/// NVIDIA (and ILUVATAR) devices via the shared elementwise macro.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    const auto dtype = out_desc->dtype();
    // Unary op: exactly one input descriptor is expected.
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
/// Applies HardSwish. Dense single-input tensors take the dedicated
/// contiguous kernel; all other layouts use the generic elementwise path,
/// which requires the pre-computed workspace.
/// Fix: removed the unreachable trailing `return` — every switch case
/// (including default) already returns.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (can_use_contiguous_fast_path(_info)) {
        switch (_dtype) {
        case INFINI_DTYPE_BF16:
            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
        case INFINI_DTYPE_F16:
            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
        case INFINI_DTYPE_F32:
            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
        case INFINI_DTYPE_F64:
            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
        default:
            break; // unexpected dtype: fall through to the generic path
        }
    }
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::hardswish::nvidia
src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDSWISH_CUDA_API_H__
#define __HARDSWISH_CUDA_API_H__

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

/// Declares op::hardswish::nvidia::Descriptor via the shared elementwise macro.
ELEMENTWISE_DESCRIPTOR(hardswish, nvidia)

#endif
src/infiniop/ops/hardswish/operator.cc
deleted
100644 → 0
View file @
6ab911c3
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/hardswish.h"
#ifdef ENABLE_CPU_API
#include "cpu/hardswish_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/hardswish_nvidia.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/hardswish_moore.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/hardswish_metax.h"
#endif
/// C entry point: dispatches descriptor creation to the backend matching
/// the handle's device type.
__INFINI_C infiniStatus_t infiniopCreateHardSwishDescriptor(
    infiniopHandle_t handle,
    infiniopHardSwishDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc) {

#define CREATE(CASE, NAMESPACE)                                                  \
    case CASE:                                                                   \
        return op::hardswish::NAMESPACE::Descriptor::create(                     \
            handle,                                                              \
            reinterpret_cast<op::hardswish::NAMESPACE::Descriptor **>(desc_ptr), \
            output_desc,                                                         \
            {input_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia); // ILUVATAR reuses the NVIDIA backend
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CREATE
}
/// C entry point: reports the workspace size the descriptor's backend needs.
/// Fix: removed the duplicate, unreachable trailing return — every switch
/// path (including default) already returns, and the sibling entry points
/// have no such trailing statement.
__INFINI_C infiniStatus_t infiniopGetHardSwishWorkspaceSize(
    infiniopHardSwishDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                     \
    case CASE:                                                                                   \
        *size = reinterpret_cast<op::hardswish::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef GET
}
/// C entry point: executes HardSwish using the backend recorded in `desc`.
__INFINI_C infiniStatus_t infiniopHardSwish(
    infiniopHardSwishDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                             \
    case CASE:                                                                 \
        return reinterpret_cast<const op::hardswish::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, output, {input}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CALCULATE
}
/// C entry point: destroys a HardSwish descriptor created by the matching
/// backend's create().
__INFINI_C infiniStatus_t infiniopDestroyHardSwishDescriptor(
    infiniopHardSwishDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                  \
    case CASE:                                                                   \
        delete reinterpret_cast<const op::hardswish::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef DELETE
}
src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.cc
deleted
100644 → 0
View file @
6ab911c3
#include "hardtanh_cpu.h"
#include <type_traits>
namespace op::hardtanh::cpu {

/// Records the validated dtype, elementwise metadata and the clamp bounds.
Descriptor::Descriptor(infiniDtype_t dtype,
                       op::elementwise::ElementwiseInfo info,
                       size_t workspace_size,
                       infiniDevice_t device_type,
                       int device_id,
                       float min_val,
                       float max_val)
    : InfiniopDescriptor{device_type, device_id},
      _dtype(dtype),
      _info(std::move(info)),
      _workspace_size(workspace_size),
      _min_val(min_val),
      _max_val(max_val) {}

Descriptor::~Descriptor() = default;
/// Validates dtype/shapes, builds the elementwise metadata and allocates the
/// CPU descriptor. The CPU backend needs no workspace, hence the literal 0.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float min_val,
    float max_val) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    const auto dtype = out_desc->dtype();
    // Unary op: exactly one input descriptor is expected.
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
    CHECK_RESULT(info_result);
    *desc_ptr = new Descriptor(
        dtype,
        info_result.take(),
        0, // no workspace needed on CPU
        handle->device,
        handle->device_id,
        min_val,
        max_val);
    return INFINI_STATUS_SUCCESS;
}
/// Applies HardTanh element-by-element on the CPU, honoring arbitrary
/// strides via indexToOffset when a tensor is not contiguous. Half-precision
/// types (fp16/bf16) are widened to float for the clamp, then narrowed back.
template <typename T>
static infiniStatus_t launchCpuHardTanh(const op::elementwise::ElementwiseInfo &info,
                                        void *output,
                                        const std::vector<const void *> &inputs,
                                        float min_val,
                                        float max_val) {
    if (inputs.empty()) {
        return INFINI_STATUS_BAD_PARAM;
    }
    auto *out = reinterpret_cast<T *>(output);
    const auto *in = reinterpret_cast<const T *>(inputs[0]);
    const auto ndim = info.getNdim();
    const auto *output_shape = info.getOutputShape();
    const auto *output_strides = info.getOutputStrides();
    const auto *input_shape = info.getInputShape(0);
    const auto *input_strides = info.getInputStrides(0);
    const auto *input_contiguous = info.getInputContiguous();
    const ptrdiff_t output_size = info.getOutputSize();
    // Parallelize only when the tensor is large enough to amortize threads.
#pragma omp parallel for if (output_size > 1024)
    for (ptrdiff_t i = 0; i < output_size; ++i) {
        const size_t out_idx = info.isOutputContiguous()
                                   ? static_cast<size_t>(i)
                                   : op::common_cpu::indexToOffset(i, ndim, output_shape, output_strides);
        const size_t in_idx = input_contiguous[0]
                                  ? static_cast<size_t>(i)
                                  : op::common_cpu::indexToOffset(i, ndim, input_shape, input_strides);
        if constexpr (std::is_same_v<T, fp16_t> || std::is_same_v<T, bf16_t>) {
            const float widened = utils::cast<float>(in[in_idx]);
            const float clamped = HardTanhOp{}(widened, min_val, max_val);
            out[out_idx] = utils::cast<T>(clamped);
        } else {
            out[out_idx] = HardTanhOp{}(in[in_idx], min_val, max_val);
        }
    }
    return INFINI_STATUS_SUCCESS;
}
/// Runs HardTanh on the CPU backend. Workspace and stream are unused here:
/// CPU execution is synchronous and allocation-free.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    (void)workspace;
    (void)workspace_size;
    (void)stream;
    if (inputs.size() != 1) {
        return INFINI_STATUS_BAD_PARAM;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return launchCpuHardTanh<bf16_t>(_info, output, inputs, _min_val, _max_val);
    case INFINI_DTYPE_F16:
        return launchCpuHardTanh<fp16_t>(_info, output, inputs, _min_val, _max_val);
    case INFINI_DTYPE_F32:
        return launchCpuHardTanh<float>(_info, output, inputs, _min_val, _max_val);
    case INFINI_DTYPE_F64:
        return launchCpuHardTanh<double>(_info, output, inputs, _min_val, _max_val);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::hardtanh::cpu
src/infiniop/ops/hardtanh/cpu/hardtanh_cpu.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDTANH_CPU_H__
#define __HARDTANH_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <algorithm>
namespace
op
::
hardtanh
::
cpu
{
/// CPU HardTanh descriptor: holds the validated dtype, elementwise metadata
/// and the clamp bounds [min_val, max_val]. Constructed only through create().
class Descriptor final : public InfiniopDescriptor {
    infiniDtype_t _dtype;
    op::elementwise::ElementwiseInfo _info;
    size_t _workspace_size;
    float _min_val;
    float _max_val;

    Descriptor(infiniDtype_t dtype,
               op::elementwise::ElementwiseInfo info,
               size_t workspace_size,
               infiniDevice_t device_type,
               int device_id,
               float min_val,
               float max_val);

public:
    ~Descriptor();

    size_t workspaceSize() const { return _workspace_size; }

    /// Validates shapes/dtype and allocates a new descriptor into *desc_ptr.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec,
        float min_val,
        float max_val);

    /// Executes HardTanh; workspace/stream are ignored on CPU.
    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;

    float minVal() const { return _min_val; }
    float maxVal() const { return _max_val; }
};
/// Element-wise HardTanh functor: clamps x into [min_val, max_val].
/// When min_val > max_val the max bound wins (clamp-below is applied first,
/// then clamp-above), matching the original two-step comparison.
/// Fix: replaced the C-style `typedef struct X {...} X;` with a plain C++
/// struct and dropped the redundant `public:` inside a struct.
struct HardTanhOp {
    static constexpr size_t num_inputs = 1;

    template <typename T>
    T operator()(const T &x, float min_val, float max_val) const {
        const T low = static_cast<T>(min_val);
        const T high = static_cast<T>(max_val);
        const T raised = x < low ? low : x;
        return raised > high ? high : raised;
    }
};
}
// namespace op::hardtanh::cpu
#endif
src/infiniop/ops/hardtanh/cuda/kernel.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDTANH_CUDA_H__
#define __HARDTANH_CUDA_H__

#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif

#include <type_traits>

namespace op::hardtanh::cuda {

/// Device functor clamping x into [min_val, max_val]. Half-precision types
/// are widened to float for the clamp, then narrowed back.
/// Fixes: (1) added a generic fallback branch — previously a type outside
/// the if-constexpr chain left operator() with no return statement (UB);
/// the moore sibling kernel already has this fallback. (2) Replaced the
/// C-style typedef-struct and C-style casts.
struct HardTanhOp {
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Vectorized path: clamp both lanes in float precision.
            float2 x_f2 = __half22float2(x);
            x_f2.x = fminf(max_val, fmaxf(min_val, x_f2.x));
            x_f2.y = fminf(max_val, fmaxf(min_val, x_f2.y));
            return __float22half2_rn(x_f2);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            float x_f = __bfloat162float(x);
            return __float2bfloat16(fminf(max_val, fmaxf(min_val, x_f)));
        } else if constexpr (std::is_same_v<T, half>) {
            float x_f = __half2float(x);
            return __float2half(fminf(max_val, fmaxf(min_val, x_f)));
        } else if constexpr (std::is_same_v<T, float>) {
            return fminf(max_val, fmaxf(min_val, x));
        } else if constexpr (std::is_same_v<T, double>) {
            return fmin(static_cast<double>(max_val), fmax(static_cast<double>(min_val), x));
        } else {
            // Fallback for any other type: clamp in float, narrow back.
            float x_f = static_cast<float>(x);
            return static_cast<T>(fminf(max_val, fmaxf(min_val, x_f)));
        }
    }
};

} // namespace op::hardtanh::cuda

#endif
src/infiniop/ops/hardtanh/metax/hardtanh_metax.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDTANH_METAX_API_H__
#define __HARDTANH_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

namespace op::hardtanh::metax {

/// METAX HardTanh descriptor: owns the device helper (_device_info) and the
/// clamp bounds. Constructed only through create().
class Descriptor final : public InfiniopDescriptor {
    infiniDtype_t _dtype;
    op::elementwise::ElementwiseInfo _info;
    std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
    size_t _workspace_size;
    float _min_val;
    float _max_val;

    Descriptor(infiniDtype_t dtype,
               op::elementwise::ElementwiseInfo info,
               op::elementwise::metax::DeviceImpl *device_info,
               size_t workspace_size,
               infiniDevice_t device_type,
               int device_id,
               float min_val,
               float max_val);

public:
    ~Descriptor();

    size_t workspaceSize() const { return _workspace_size; }

    /// Validates shapes/dtype and allocates a new descriptor into *desc_ptr.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec,
        float min_val,
        float max_val);

    /// Executes HardTanh using the generic elementwise machinery.
    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;
};

} // namespace op::hardtanh::metax

#endif // __HARDTANH_METAX_API_H__
src/infiniop/ops/hardtanh/metax/hardtanh_metax.maca
deleted
100644 → 0
View file @
6ab911c3
#include "hardtanh_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::hardtanh::metax {
// Records the validated dtype, elementwise metadata and clamp bounds.
// Takes ownership of `device_info` via the unique_ptr member.
Descriptor::Descriptor(infiniDtype_t dtype,
                       op::elementwise::ElementwiseInfo info,
                       op::elementwise::metax::DeviceImpl *device_info,
                       size_t workspace_size,
                       infiniDevice_t device_type,
                       int device_id,
                       float min_val,
                       float max_val)
    : InfiniopDescriptor{device_type, device_id},
      _dtype(dtype),
      _info(std::move(info)),
      _device_info(device_info),
      _workspace_size(workspace_size),
      _min_val(min_val),
      _max_val(max_val) {}
Descriptor::~Descriptor() = default;
// Validates dtype/shapes, builds the elementwise metadata and the METAX
// device helper, then allocates the descriptor into *desc_ptr.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float min_val,
    float max_val) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // Unary op: exactly one input descriptor is expected.
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
    CHECK_RESULT(info_result);
    auto info = info_result.take();
    // Workspace holds the metadata plus one device pointer per input.
    auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);
    auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal());
    CHECK_RESULT(device_impl_result);
    *desc_ptr = new Descriptor(
        dtype,
        std::move(info),
        device_impl_result.take(),
        workspace_size,
        handle->device,
        handle->device_id,
        min_val,
        max_val);
    return INFINI_STATUS_SUCCESS;
}
// Executes HardTanh on METAX via the generic elementwise machinery,
// forwarding the clamp bounds to the HardTanhOp functor.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    // Dispatch on the dtype recorded at create() time; 256 is the block size.
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(
            _info, workspace, output, inputs, stream, _min_val, _max_val);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::HardTanhOp, half>(
            _info, workspace, output, inputs, stream, _min_val, _max_val);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::HardTanhOp, float>(
            _info, workspace, output, inputs, stream, _min_val, _max_val);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::HardTanhOp, double>(
            _info, workspace, output, inputs, stream, _min_val, _max_val);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::hardtanh::metax
src/infiniop/ops/hardtanh/moore/hardtanh_moore.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDTANH_MOORE_API_H__
#define __HARDTANH_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
namespace
op
::
hardtanh
::
moore
{
class
Descriptor
final
:
public
InfiniopDescriptor
{
infiniDtype_t
_dtype
;
op
::
elementwise
::
ElementwiseInfo
_info
;
std
::
unique_ptr
<
op
::
elementwise
::
moore
::
DeviceImpl
>
_device_info
;
size_t
_workspace_size
;
float
_min_val
;
float
_max_val
;
Descriptor
(
infiniDtype_t
dtype
,
op
::
elementwise
::
ElementwiseInfo
info
,
op
::
elementwise
::
moore
::
DeviceImpl
*
device_info
,
size_t
workspace_size
,
infiniDevice_t
device_type
,
int
device_id
,
float
min_val
,
float
max_val
);
public:
~
Descriptor
();
size_t
workspaceSize
()
const
{
return
_workspace_size
;
}
static
infiniStatus_t
create
(
infiniopHandle_t
handle
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
out_desc
,
std
::
vector
<
infiniopTensorDescriptor_t
>
input_desc_vec
,
float
min_val
,
float
max_val
);
infiniStatus_t
calculate
(
void
*
workspace
,
size_t
workspace_size
,
void
*
output
,
std
::
vector
<
const
void
*>
inputs
,
void
*
stream
)
const
;
float
minVal
()
const
{
return
_min_val
;
}
float
maxVal
()
const
{
return
_max_val
;
}
};
}
// namespace op::hardtanh::moore
#endif // __HARDTANH_MOORE_API_H__
src/infiniop/ops/hardtanh/moore/hardtanh_moore.mu
deleted
100644 → 0
View file @
6ab911c3
#include "hardtanh_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "hardtanh_moore_kernel.h"
namespace op::hardtanh::moore {
namespace {

/// True when the single input and the output are both dense (contiguous,
/// non-broadcast), so the generic strided machinery can be bypassed.
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    if (!info.isOutputContiguous() || info.getInputSize() != 1) {
        return false;
    }
    return info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}

/// Grid-stride kernel clamping each element of a flat buffer to
/// [min_val, max_val].
template <typename T>
INFINIOP_MOORE_KERNEL hardtanh_contiguous_kernel(size_t numel,
                                                 T *out,
                                                 const T *in,
                                                 float min_val,
                                                 float max_val) {
    const op::hardtanh::moore::HardTanhOp clamp_op{};
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t step = blockDim.x * gridDim.x;
    while (i < numel) {
        out[i] = clamp_op(in[i], min_val, max_val);
        i += step;
    }
}

/// Launches the contiguous kernel on the given MUSA stream.
/// A zero-sized tensor is a successful no-op.
template <typename T>
infiniStatus_t launch_fast_path(size_t numel,
                                void *output,
                                const std::vector<const void *> &inputs,
                                void *stream,
                                float min_val,
                                float max_val) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    constexpr int kBlockSize = 256;
    int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
    if (grid > 65535) {
        grid = 65535; // the grid-stride loop covers the remainder
    }
    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
    hardtanh_contiguous_kernel<T><<<grid, kBlockSize, 0, musa_stream>>>(
        numel,
        reinterpret_cast<T *>(output),
        reinterpret_cast<const T *>(inputs[0]),
        min_val,
        max_val);
    return INFINI_STATUS_SUCCESS;
}

} // namespace
// Records the validated dtype, elementwise metadata and clamp bounds.
// Takes ownership of `device_info` via the unique_ptr member.
Descriptor::Descriptor(infiniDtype_t dtype,
                       op::elementwise::ElementwiseInfo info,
                       op::elementwise::moore::DeviceImpl *device_info,
                       size_t workspace_size,
                       infiniDevice_t device_type,
                       int device_id,
                       float min_val,
                       float max_val)
    : InfiniopDescriptor{device_type, device_id},
      _dtype(dtype),
      _info(std::move(info)),
      _device_info(device_info),
      _workspace_size(workspace_size),
      _min_val(min_val),
      _max_val(max_val) {}
Descriptor::~Descriptor() = default;
// Validates dtype/shapes, builds the elementwise metadata and the MOORE
// device helper, then allocates the descriptor into *desc_ptr.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float min_val,
    float max_val) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // Unary op: exactly one input descriptor is expected.
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
    CHECK_RESULT(info_result);
    auto info = info_result.take();
    // Workspace holds the metadata plus one device pointer per input.
    auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);
    auto device_impl_result = op::elementwise::moore::DeviceImpl::create(handle->internal());
    CHECK_RESULT(device_impl_result);
    *desc_ptr = new Descriptor(
        dtype,
        std::move(info),
        device_impl_result.take(),
        workspace_size,
        handle->device,
        handle->device_id,
        min_val,
        max_val);
    return INFINI_STATUS_SUCCESS;
}
/// Applies HardTanh on MOORE devices. Dense single-input tensors take the
/// dedicated contiguous kernel; all other layouts use the generic
/// elementwise path, which requires the pre-computed workspace.
/// Fix: removed the unreachable trailing `return` — every switch case
/// (including default) already returns.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (can_use_contiguous_fast_path(_info)) {
        switch (_dtype) {
        case INFINI_DTYPE_BF16:
            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
        case INFINI_DTYPE_F16:
            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
        case INFINI_DTYPE_F32:
            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
        case INFINI_DTYPE_F64:
            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
        default:
            break; // unexpected dtype: fall through to the generic path
        }
    }
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, moore::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, moore::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, moore::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, moore::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::hardtanh::moore
src/infiniop/ops/hardtanh/moore/hardtanh_moore_kernel.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDTANH_MOORE_KERNEL_H__
#define __HARDTANH_MOORE_KERNEL_H__
#include <cmath>
#include <type_traits>
namespace op::hardtanh::moore {

/// Elementwise HardTanh functor: clamps x into [min_val, max_val].
/// Half-precision types (half, cuda_bfloat16) are widened to float for the
/// clamp and narrowed back afterwards; double clamps in double precision.
/// (The C-style `typedef struct ... X;` of the original is redundant in C++
/// and has been dropped — the type is identical.)
struct HardTanhOp {
public:
    // Unary operator: the elementwise framework feeds it exactly one input.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const {
        if constexpr (std::is_same_v<T, half>) {
            const float widened = __half2float(x);
            return __float2half(fminf(max_val, fmaxf(min_val, widened)));
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            const float widened = __bfloat162float(x);
            return __float2bfloat16_rn(fminf(max_val, fmaxf(min_val, widened)));
        } else if constexpr (std::is_same_v<T, float>) {
            return fminf(max_val, fmaxf(min_val, x));
        } else if constexpr (std::is_same_v<T, double>) {
            return fmin(static_cast<double>(max_val), fmax(static_cast<double>(min_val), x));
        } else {
            // Fallback for any other arithmetic type: clamp in float.
            const float widened = static_cast<float>(x);
            return static_cast<T>(fminf(max_val, fmaxf(min_val, widened)));
        }
    }
};

} // namespace op::hardtanh::moore
#endif // __HARDTANH_MOORE_KERNEL_H__
src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cu
deleted
100644 → 0
View file @
6ab911c3
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "hardtanh_nvidia.cuh"
#include <cuda_runtime.h>
namespace
op
::
hardtanh
::
nvidia
{
namespace
{
// True when the single input/output pair is dense and unbroadcast, so the
// generic strided elementwise machinery can be bypassed in favor of a flat
// 1-D kernel launch.
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    if (!info.isOutputContiguous() || info.getInputSize() != 1) {
        return false;
    }
    return info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}
// Dense 1-D HardTanh kernel over a contiguous buffer. The grid-stride loop
// lets any grid size cover all `numel` elements.
template <typename T>
__global__ void hardtanh_contiguous_kernel(size_t numel, T *out, const T *in,
                                           float min_val, float max_val) {
    const op::hardtanh::cuda::HardTanhOp clamp{};
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < numel;
         i += blockDim.x * gridDim.x) {
        out[i] = clamp(in[i], min_val, max_val);
    }
}
// Host-side launcher for hardtanh_contiguous_kernel. Returns success
// immediately for empty tensors; caps the grid at 65535 blocks and relies on
// the kernel's stride loop to absorb any remainder. Reports a launch failure
// (cudaGetLastError) as INFINI_STATUS_INTERNAL_ERROR.
template <typename T>
infiniStatus_t launch_fast_path(size_t numel, void *output,
                                const std::vector<const void *> &inputs,
                                void *stream, float min_val, float max_val) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS;
    }

    constexpr int BLOCK_SIZE = 256;
    const int grid = std::min(
        static_cast<int>((numel + BLOCK_SIZE - 1) / BLOCK_SIZE), 65535);

    auto *dst = reinterpret_cast<T *>(output);
    const auto *src = reinterpret_cast<const T *>(inputs[0]);
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);

    hardtanh_contiguous_kernel<<<grid, BLOCK_SIZE, 0, cuda_stream>>>(
        numel, dst, src, min_val, max_val);

    return cudaGetLastError() == cudaSuccess ? INFINI_STATUS_SUCCESS
                                             : INFINI_STATUS_INTERNAL_ERROR;
}
}
// namespace
// Stores the validated elementwise metadata together with the clamp bounds.
// `device_info` is adopted by the unique_ptr member declared in the header,
// so the descriptor owns it from here on.
Descriptor::Descriptor(
    infiniDtype_t dtype,
    op::elementwise::ElementwiseInfo info,
    op::elementwise::nvidia::DeviceImpl *device_info,
    size_t workspace_size,
    infiniDevice_t device_type,
    int device_id,
    float min_val,
    float max_val)
    : InfiniopDescriptor{device_type, device_id},
      _dtype(dtype),
      _info(std::move(info)),
      _device_info(device_info),
      _workspace_size(workspace_size),
      _min_val(min_val),
      _max_val(max_val) {}
// Defaulted: the owned DeviceImpl (held by unique_ptr) releases itself.
Descriptor::~Descriptor() = default;
/// Validates descriptors and builds a HardTanh Descriptor for NVIDIA devices.
/// HardTanh is unary: exactly one input, same shape as the output, restricted
/// to BF16/F16/F32/F64. On success *desc_ptr receives the new descriptor.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float min_val,
    float max_val) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    const auto dtype = out_desc->dtype();
    const auto &input_desc = input_desc_vec.at(0);

    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);

    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
    CHECK_RESULT(info_result);
    auto info = info_result.take();

    // Workspace layout: packed metadata followed by one device pointer per input.
    const auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);

    auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal());
    CHECK_RESULT(device_impl_result);

    *desc_ptr = new Descriptor(
        dtype,
        std::move(info),
        device_impl_result.take(),
        workspace_size,
        handle->device,
        handle->device_id,
        min_val,
        max_val);
    return INFINI_STATUS_SUCCESS;
}
/// Applies HardTanh on NVIDIA devices: clamps every input element into
/// [_min_val, _max_val]. Contiguous unary inputs take a dedicated dense
/// kernel needing no workspace; otherwise the generic elementwise path runs
/// and `workspace_size` must cover the precomputed requirement.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (can_use_contiguous_fast_path(_info)) {
        const size_t numel = _info.getOutputSize();
        switch (_dtype) {
        case INFINI_DTYPE_BF16:
            return launch_fast_path<cuda_bfloat16>(numel, output, inputs, stream, _min_val, _max_val);
        case INFINI_DTYPE_F16:
            return launch_fast_path<half>(numel, output, inputs, stream, _min_val, _max_val);
        case INFINI_DTYPE_F32:
            return launch_fast_path<float>(numel, output, inputs, stream, _min_val, _max_val);
        case INFINI_DTYPE_F64:
            return launch_fast_path<double>(numel, output, inputs, stream, _min_val, _max_val);
        default:
            break; // unsupported dtype: fall through to the generic path
        }
    }
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
}
// namespace op::hardtanh::nvidia
src/infiniop/ops/hardtanh/nvidia/hardtanh_nvidia.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __HARDTANH_CUDA_API_H__
#define __HARDTANH_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
namespace
op
::
hardtanh
::
nvidia
{
/// Descriptor for the HardTanh elementwise operator on NVIDIA devices.
/// Holds the validated tensor metadata, the clamp bounds, and the
/// device-side elementwise implementation.
class Descriptor final : public InfiniopDescriptor {
    infiniDtype_t _dtype;                  // element type shared by input and output
    op::elementwise::ElementwiseInfo _info;
    std::unique_ptr<op::elementwise::nvidia::DeviceImpl> _device_info;
    size_t _workspace_size;                // bytes calculate() requires for the generic path
    float _min_val;                        // lower clamp bound
    float _max_val;                        // upper clamp bound

    // Private: instances are built through create().
    Descriptor(
        infiniDtype_t dtype,
        op::elementwise::ElementwiseInfo info,
        op::elementwise::nvidia::DeviceImpl *device_info,
        size_t workspace_size,
        infiniDevice_t device_type,
        int device_id,
        float min_val,
        float max_val);

public:
    ~Descriptor();

    /// Minimum workspace (bytes) the caller must hand to calculate().
    size_t workspaceSize() const { return _workspace_size; }

    /// Validates descriptors (one input, same shape as output, supported
    /// dtype) and allocates a new Descriptor into *desc_ptr.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec,
        float min_val,
        float max_val);

    /// Clamps every input element into [minVal(), maxVal()] and writes the
    /// result to `output` on the given stream.
    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;

    float minVal() const { return _min_val; }
    float maxVal() const { return _max_val; }
};
}
// namespace op::hardtanh::nvidia
#endif
src/infiniop/ops/hardtanh/operator.cc
deleted
100644 → 0
View file @
6ab911c3
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/hardtanh.h"
#ifdef ENABLE_CPU_API
#include "cpu/hardtanh_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/hardtanh_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/hardtanh_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/hardtanh_moore.h"
#endif
// C ABI entry point: builds a HardTanh descriptor by dispatching to the
// backend matching the handle's device type. *desc_ptr receives the result.
__INFINI_C infiniStatus_t infiniopCreateHardTanhDescriptor(
    infiniopHandle_t handle,
    infiniopHardTanhDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    float min_val,
    float max_val) {

// Expands to one switch case per enabled backend.
#define CREATE(CASE, NAMESPACE)                                                 \
    case CASE:                                                                  \
        return op::hardtanh::NAMESPACE::Descriptor::create(                     \
            handle,                                                             \
            reinterpret_cast<op::hardtanh::NAMESPACE::Descriptor **>(desc_ptr), \
            output_desc,                                                        \
            {input_desc},                                                       \
            min_val,                                                            \
            max_val)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the NVIDIA implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// C ABI entry point: reports the workspace byte count infiniopHardTanh
// requires for this descriptor. Fix: removed the unreachable duplicate
// `return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;` that followed the switch
// (every case and the default already return), making this consistent with
// the sibling entry points in this file.
__INFINI_C infiniStatus_t infiniopGetHardTanhWorkspaceSize(
    infiniopHardTanhDescriptor_t desc,
    size_t *size) {

// Expands to one switch case per enabled backend.
#define GET(CASE, NAMESPACE)                                                                  \
    case CASE:                                                                                \
        *size = reinterpret_cast<op::hardtanh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the NVIDIA implementation.
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// C ABI entry point: executes HardTanh, clamping `input` into the bounds
// stored in `desc` and writing to `output`, dispatched by device type.
__INFINI_C infiniStatus_t infiniopHardTanh(
    infiniopHardTanhDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream) {

// Expands to one switch case per enabled backend.
#define CALCULATE(CASE, NAMESPACE)                                           \
    case CASE:                                                               \
        return reinterpret_cast<const op::hardtanh::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, output, {input}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the NVIDIA implementation.
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// C ABI entry point: destroys a HardTanh descriptor by deleting the
// backend-specific object matching its device type.
__INFINI_C infiniStatus_t infiniopDestroyHardTanhDescriptor(
    infiniopHardTanhDescriptor_t desc) {

// Expands to one switch case per enabled backend.
#define DELETE(CASE, NAMESPACE)                                                  \
    case CASE:                                                                   \
        delete reinterpret_cast<const op::hardtanh::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the NVIDIA implementation.
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
src/infiniop/ops/sum/cpu/sum_cpu.cc
deleted
100644 → 0
View file @
6ab911c3
#include "sum_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
namespace
op
::
sum
::
cpu
{
// Nothing to release: the CPU sum descriptor owns no extra resources.
Descriptor::~Descriptor() {}
/// Builds a CPU sum Descriptor. Validation and reduction planning live in
/// SumInfo::create; the CPU backend itself needs no device context and no
/// workspace, hence the nullptr handle-resource and zero workspace size.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    auto result = SumInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
    CHECK_RESULT(result);

    *desc_ptr = new Descriptor(
        nullptr, result.take(), 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
namespace
{
// Sums `input` into `output` following the reduction plan in `info`.
// Accumulation is always done in float, regardless of T, to limit rounding
// error for half-precision element types.
template <typename T>
infiniStatus_t calculateSum(const SumInfo *info, T *output, const T *input) {
    const size_t ndim = info->permuted_input_shape.size();

    if (info->reduce_dim_size == ndim) {
        // Every axis is reduced: collapse the whole tensor to one scalar.
        float acc = 0.;
        for (size_t index = 0; index < info->input_size; index++) {
            const size_t input_offset = op::common_cpu::indexToOffset(
                index,
                ndim,
                info->permuted_input_shape.data(),
                info->permuted_input_strides.data());
            acc += utils::cast<float>(input[input_offset]);
        }
        output[0] = utils::cast<T>(acc);
    } else {
        // Partial reduction: each output element sums `reduce_num`
        // consecutive elements of the permuted input view.
        for (size_t i = 0; i < info->output_size; i++) {
            const size_t output_offset = op::common_cpu::indexToOffset(
                i,
                info->output_shape.size(),
                info->output_shape.data(),
                info->output_strides.data());
            float acc = 0.;
            for (size_t j = 0; j < info->reduce_num; j++) {
                const size_t input_offset = op::common_cpu::indexToOffset(
                    j + i * info->reduce_num,
                    ndim,
                    info->permuted_input_shape.data(),
                    info->permuted_input_strides.data());
                acc += utils::cast<float>(input[input_offset]);
            }
            output[output_offset] = utils::cast<T>(acc);
        }
    }
    return INFINI_STATUS_SUCCESS;
}
}
// namespace
/// Runs the sum reduction on CPU, dispatching on the element type recorded
/// at create() time. `workspace`, `workspace_size`, and `stream` are unused
/// on this backend. Returns INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported
/// element types.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return calculateSum<fp16_t>(&_info,
                                    static_cast<fp16_t *>(output),
                                    static_cast<const fp16_t *>(input));
    case INFINI_DTYPE_F32:
        return calculateSum<float>(&_info,
                                   static_cast<float *>(output),
                                   static_cast<const float *>(input));
    case INFINI_DTYPE_BF16:
        return calculateSum<bf16_t>(&_info,
                                    static_cast<bf16_t *>(output),
                                    static_cast<const bf16_t *>(input));
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
}
// namespace op::sum::cpu
src/infiniop/ops/sum/cpu/sum_cpu.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __INFINIOP_SUM_CPU_H__
#define __INFINIOP_SUM_CPU_H__

#include "../sum_desc.h"

// Declares op::sum::cpu::Descriptor via the shared DESCRIPTOR macro from
// sum_desc.h; the implementation lives in sum_cpu.cc.
DESCRIPTOR(cpu);

#endif // __INFINIOP_SUM_CPU_H__
src/infiniop/ops/sum/cuda/kernel.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __SUM_CUDA_H__
#define __SUM_CUDA_H__
// Maps a flat index over `shape` (row-major, innermost dimension last) to
// the strided element offset described by `strides`.
__forceinline__ __device__ __host__ size_t indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    // Peel one coordinate per step, walking dimensions inner to outer.
    for (size_t d = ndim; d-- > 0;) {
        offset += (flat_index % shape[d]) * strides[d];
        flat_index /= shape[d];
    }
    return offset;
}
// Reduces the entire (strided) input to a single scalar: each block tree-
// reduces its slice in shared memory, then thread 0 atomically adds the
// block's partial sum into *output.
// NOTE(review): the halving loop assumes blockDim.x is a power of two, and
// *output must be zero-initialized before launch — confirm at the call site.
template <size_t BLOCK_SIZE, typename Tdata, typename Tcompute>
__global__ void sumAllKernel(
    Tcompute *output,
    const Tdata *input,
    size_t input_size,
    size_t permuted_input_shape_size,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides) {
    __shared__ Tcompute s_data[BLOCK_SIZE];
    size_t tid = threadIdx.x;
    size_t idx = tid + blockIdx.x * blockDim.x;
    // Stage one element per thread; pad the tail past input_size with zeros.
    if (idx < input_size) {
        size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
        s_data[tid] = static_cast<Tcompute>(input[input_offset]);
    } else {
        s_data[tid] = static_cast<Tcompute>(0.f);
    }
    __syncthreads();
    // Pairwise halving reduction in shared memory.
    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            s_data[tid] += s_data[tid + s];
        }
        __syncthreads();
    }
    // One atomic per block publishes the partial sum.
    if (tid == 0) {
        atomicAdd(output, s_data[0]);
    }
}
// Partial reduction: one thread per output element. Thread `idx` sums its
// `reduce_num` consecutive elements of the permuted input view, accumulating
// in float, and writes the result at the strided output offset.
template <size_t BLOCK_SIZE, typename T>
__global__ void sumKernel(
    T *output,
    const T *input,
    size_t permuted_input_shape_size,
    size_t output_shape_size,
    size_t output_size,
    size_t reduce_num,
    size_t *permuted_input_shape,
    size_t *output_shape,
    ptrdiff_t *permuted_input_strides,
    ptrdiff_t *output_strides) {
    size_t tid = threadIdx.x;
    size_t idx = tid + blockIdx.x * blockDim.x;
    // Threads past the last output element have no work.
    if (idx >= output_size) {
        return;
    }
    size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides);
    float tempSum = static_cast<float>(0.f);
    for (size_t i = 0; i < reduce_num; i++) {
        size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
        tempSum += static_cast<float>(input[input_offset]);
    }
    output[output_index] = static_cast<T>(tempSum);
}
#endif // __SUM_CUDA_H__
Prev
1
…
3
4
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment