jerrrrry / infinicore · Commits

Commit 45a3794b, authored Mar 11, 2026 by wooway777
issue/1031 T1-1-17
Parent: cb7f0b7d

Showing 20 changed files with 1549 additions and 0 deletions
src/infiniop/ops/avg_pool1d/cuda/kernel.cuh                    +58
src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h           +8
src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca        +170
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h          +72
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h           +8
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu          +135
src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu        +126
src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh       +8
src/infiniop/ops/avg_pool1d/operator.cc                        +225
src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc        +99
src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h         +8
src/infiniop/ops/cross_entropy/cross_entropy.h                 +42
src/infiniop/ops/cross_entropy/cuda/kernel.cuh                 +80
src/infiniop/ops/cross_entropy/info.h                          +17
src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h     +8
src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca  +188
src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h    +53
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h     +8
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu    +129
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu  +107
src/infiniop/ops/avg_pool1d/cuda/kernel.cuh (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__
#define __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__

template <typename T>
__device__ void avgPool1dKernel(
    T *y,
    const T *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    size_t total_elements = batch * channels * out_width;
    // Grid-stride loop: one output element per iteration.
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        // Decompose the flat index into (batch, channel, output position).
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
        long long start_w = static_cast<long long>(ow * stride) - padding;
        T sum = 0;
        for (size_t k = 0; k < kernel_size; ++k) {
            long long iw = start_w + k;
            if (iw >= 0 && iw < static_cast<long long>(in_width)) {
                size_t x_offset = b * x_stride_batch + c * x_stride_channel + iw * x_stride_width;
                sum += x[x_offset];
            }
        }
#if defined(ENABLE_ILUVATAR_API)
        // Iluvatar __half doesn't accept size_t directly.
        y[y_offset] = sum / static_cast<T>(static_cast<double>(kernel_size));
#else
        y[y_offset] = sum / static_cast<T>(kernel_size);
#endif
    }
}

#endif
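A quick host-side cross-check of this kernel's semantics can help when porting it to new backends. The sketch below is not part of the commit; it assumes contiguous NCW layout and the usual shape relation out_width = (in_width + 2*padding - kernel_size) / stride + 1, which this commit computes elsewhere (in AvgPool1dInfo, not shown here):

#include <cstddef>
#include <vector>

// Host-side reference for contiguous NCW tensors; a checking sketch only,
// not part of this commit.
std::vector<float> avgPool1dRef(const std::vector<float> &x,
                                size_t batch, size_t channels, size_t in_width,
                                size_t kernel_size, size_t stride, size_t padding) {
    size_t out_width = (in_width + 2 * padding - kernel_size) / stride + 1;
    std::vector<float> y(batch * channels * out_width, 0.0f);
    for (size_t b = 0; b < batch; ++b)
        for (size_t c = 0; c < channels; ++c)
            for (size_t ow = 0; ow < out_width; ++ow) {
                long long start = (long long)(ow * stride) - (long long)padding;
                float sum = 0.0f;
                for (size_t k = 0; k < kernel_size; ++k) {
                    long long iw = start + (long long)k;
                    if (iw >= 0 && iw < (long long)in_width)
                        sum += x[(b * channels + c) * in_width + iw];
                }
                // Like the kernel above, divide by kernel_size even when part
                // of the window falls in the padding (count_include_pad).
                y[(b * channels + c) * out_width + ow] = sum / (float)kernel_size;
            }
    return y;
}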
src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.h (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_METAX_H__
#define __INFINIOP_AVG_POOL1D_METAX_H__

#include "../avg_pool1d.h"

DESCRIPTOR(metax)

#endif // __INFINIOP_AVG_POOL1D_METAX_H__
src/infiniop/ops/avg_pool1d/metax/avg_pool1d_metax.maca (new file, mode 100644)
#include "../../../devices/metax/metax_common.h"
#include "avg_pool1d_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include <type_traits>
namespace op::avg_pool1d::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t kernel_size,
size_t stride,
size_t padding) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{handle->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tcompute>
__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
if constexpr (std::is_same_v<Tdata, half>) {
return __float2half(static_cast<float>(val));
} else if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
return __float2bfloat16(static_cast<float>(val));
} else {
return static_cast<Tdata>(val);
}
}
template <typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL avgPool1dGlobalKernel(
Tdata *y,
const Tdata *x,
size_t batch,
size_t channels,
size_t in_width,
size_t out_width,
size_t kernel_size,
size_t stride,
size_t padding,
ptrdiff_t y_stride_batch,
ptrdiff_t y_stride_channel,
ptrdiff_t y_stride_width,
ptrdiff_t x_stride_batch,
ptrdiff_t x_stride_channel,
ptrdiff_t x_stride_width) {
size_t total_elements = batch * channels * out_width;
Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < total_elements;
idx += gridDim.x * blockDim.x) {
size_t ow = idx % out_width;
size_t temp = idx / out_width;
size_t c = temp % channels;
size_t b = temp / channels;
size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
size_t x_base = b * x_stride_batch + c * x_stride_channel;
long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
long long end_w = start_w + static_cast<long long>(kernel_size);
long long iw_start = start_w < 0 ? 0 : start_w;
long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
Tcompute sum = Tcompute(0);
if (iw_start < iw_end) {
size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
for (long long iw = iw_start; iw < iw_end; ++iw) {
sum += static_cast<Tcompute>(x[x_offset]);
x_offset += x_stride_width;
}
}
y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
}
}
template <typename Tdata, typename Tcompute>
infiniStatus_t calculateAvgPool1d(
const AvgPool1dInfo &info,
int max_threads_per_block,
Tdata *y,
const Tdata *x,
hcStream_t stream) {
size_t total_elements = info.batch * info.channels * info.out_width;
int block_size = 256;
if (max_threads_per_block > 0 && max_threads_per_block < block_size) {
block_size = max_threads_per_block;
}
size_t grid_size = (total_elements + block_size - 1) / block_size;
if (grid_size > 65535) {
grid_size = 65535;
}
avgPool1dGlobalKernel<Tdata, Tcompute><<<grid_size, block_size, 0, stream>>>(
y, x,
info.batch, info.channels, info.in_width, info.out_width,
info.kernel_size, info.stride, info.padding,
info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE(TDATA, TCOMPUTE) \
calculateAvgPool1d<TDATA, TCOMPUTE>( \
_info, \
_opaque->internal->maxThreadsPerBlock(), \
(TDATA *)y, \
(const TDATA *)x, \
(hcStream_t)stream)
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) const {
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_info.dtype) {
case INFINI_DTYPE_F16:
return CALCULATE(half, float);
case INFINI_DTYPE_BF16:
return CALCULATE(cuda_bfloat16, float);
case INFINI_DTYPE_F32:
return CALCULATE(float, float);
case INFINI_DTYPE_F64:
return CALCULATE(double, double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
#undef CALCULATE
} // namespace op::avg_pool1d::metax
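The launch math above caps grid_size at 65535 blocks; correctness is preserved because the kernel's grid-stride loop sweeps any elements beyond one per thread. A standalone sketch of the same arithmetic, with illustrative sizes:

#include <cstddef>
#include <cstdio>

int main() {
    size_t total_elements = 20000000; // batch * channels * out_width
    int block_size = 256;
    size_t grid_size = (total_elements + block_size - 1) / block_size; // 78125
    if (grid_size > 65535) {
        grid_size = 65535; // clamped; the grid-stride loop covers the rest
    }
    size_t threads = grid_size * block_size;
    size_t max_iters = (total_elements + threads - 1) / threads; // 2 passes per thread
    std::printf("grid=%zu, block=%d, loop iterations per thread <= %zu\n",
                grid_size, block_size, max_iters);
    return 0;
}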
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_kernel.h (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
#define __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__

#include <type_traits>

namespace op::avg_pool1d::moore {

template <typename Tdata, typename Tcompute>
__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
    if constexpr (std::is_same_v<Tdata, half>) {
        return __float2half(static_cast<float>(val));
    } else if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
        return __float2bfloat16_rn(static_cast<float>(val));
    } else {
        return static_cast<Tdata>(val);
    }
}

template <typename Tdata, typename Tcompute>
__device__ void avgPool1dKernel(
    Tdata *y,
    const Tdata *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    size_t total_elements = batch * channels * out_width;
    Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
        size_t x_base = b * x_stride_batch + c * x_stride_channel;
        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
        long long end_w = start_w + static_cast<long long>(kernel_size);
        long long iw_start = start_w < 0 ? 0 : start_w;
        long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
        Tcompute sum = Tcompute(0);
        if (iw_start < iw_end) {
            size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
            for (long long iw = iw_start; iw < iw_end; ++iw) {
                sum += static_cast<Tcompute>(x[x_offset]);
                x_offset += x_stride_width;
            }
        }
        y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
    }
}

} // namespace op::avg_pool1d::moore

#endif // __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
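Unlike the CUDA kernel earlier in this commit, which bounds-checks every tap, this variant clamps the window to [iw_start, iw_end) once and then reads without branching. A standalone illustration of the clamping, with values chosen for demonstration:

#include <algorithm>
#include <cstdio>

int main() {
    const long long in_width = 8, kernel_size = 4, stride = 2, padding = 2;
    const long long out_width = (in_width + 2 * padding - kernel_size) / stride + 1; // 5
    for (long long ow = 0; ow < out_width; ++ow) {
        long long start_w = ow * stride - padding;
        long long end_w = start_w + kernel_size;
        long long iw_start = std::max(start_w, 0LL);
        long long iw_end = std::min(end_w, in_width);
        // ow=0 gives window [-2,2) -> clamped [0,2): only 2 of 4 taps are read,
        // yet the divisor stays kernel_size (count_include_pad semantics).
        std::printf("ow=%lld: window [%lld,%lld) -> clamped [%lld,%lld)\n",
                    ow, start_w, end_w, iw_start, iw_end);
    }
    return 0;
}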
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.h (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_MOORE_H__
#define __INFINIOP_AVG_POOL1D_MOORE_H__

#include "../avg_pool1d.h"

DESCRIPTOR(moore)

#endif // __INFINIOP_AVG_POOL1D_MOORE_H__
src/infiniop/ops/avg_pool1d/moore/avg_pool1d_moore.mu (new file, mode 100644)
#include "../../../devices/moore/moore_common.h"
#include "avg_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "avg_pool1d_kernel.h"
namespace op::avg_pool1d::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t kernel_size,
size_t stride,
size_t padding) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
info.take(),
0,
new Opaque{handle->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL avgPool1dGlobalKernel(
Tdata *y,
const Tdata *x,
size_t batch,
size_t channels,
size_t in_width,
size_t out_width,
size_t kernel_size,
size_t stride,
size_t padding,
ptrdiff_t y_stride_batch,
ptrdiff_t y_stride_channel,
ptrdiff_t y_stride_width,
ptrdiff_t x_stride_batch,
ptrdiff_t x_stride_channel,
ptrdiff_t x_stride_width) {
avgPool1dKernel<Tdata, Tcompute>(
y, x,
batch, channels, in_width, out_width,
kernel_size, stride, padding,
y_stride_batch, y_stride_channel, y_stride_width,
x_stride_batch, x_stride_channel, x_stride_width);
}
template <typename Tdata, typename Tcompute>
infiniStatus_t calculateAvgPool1d(
const AvgPool1dInfo &info,
int max_threads_per_block,
Tdata *y,
const Tdata *x,
musaStream_t stream) {
size_t total_elements = info.batch * info.channels * info.out_width;
int block_size = 256;
if (max_threads_per_block > 0 && max_threads_per_block < block_size) {
block_size = max_threads_per_block;
}
size_t grid_size = (total_elements + block_size - 1) / block_size;
if (grid_size > 65535) {
grid_size = 65535;
}
avgPool1dGlobalKernel<Tdata, Tcompute><<<grid_size, block_size, 0, stream>>>(
y, x,
info.batch, info.channels, info.in_width, info.out_width,
info.kernel_size, info.stride, info.padding,
info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE(TDATA, TCOMPUTE)                      \
    calculateAvgPool1d<TDATA, TCOMPUTE>(                \
        _info,                                          \
        _opaque->internal->maxThreadsPerBlock(),        \
        (TDATA *)y,                                     \
        (const TDATA *)x,                               \
        (musaStream_t)stream)
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) const {
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_info.dtype) {
case INFINI_DTYPE_F16:
return CALCULATE(half, float);
case INFINI_DTYPE_BF16:
return CALCULATE(cuda_bfloat16, float);
case INFINI_DTYPE_F32:
return CALCULATE(float, float);
case INFINI_DTYPE_F64:
return CALCULATE(double, double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
#undef CALCULATE
} // namespace op::avg_pool1d::moore
src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cu (new file, mode 100644)
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "avg_pool1d_nvidia.cuh"
template <typename T>
__global__ void avgPool1dGlobalKernel(
    T *y,
    const T *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    avgPool1dKernel<T>(
        y, x,
        batch, channels, in_width, out_width,
        kernel_size, stride, padding,
        y_stride_batch, y_stride_channel, y_stride_width,
        x_stride_batch, x_stride_channel, x_stride_width);
}

namespace op::avg_pool1d::nvidia {

struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info);
    *desc_ptr = new Descriptor(
        info.take(),
        0,
        new Opaque{handle->internal()},
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

template <typename T>
infiniStatus_t calculateAvgPool1d(
    const AvgPool1dInfo &info,
    int max_threads_per_block,
    T *y,
    const T *x,
    cudaStream_t stream) {
    size_t total_elements = info.batch * info.channels * info.out_width;
    int block_size = 256;
    if (max_threads_per_block > 0 && max_threads_per_block < 256) {
        block_size = max_threads_per_block;
    }
    size_t grid_size = (total_elements + block_size - 1) / block_size;
    if (grid_size > 65535) {
        grid_size = 65535;
    }
    avgPool1dGlobalKernel<T><<<grid_size, block_size, 0, stream>>>(
        y, x,
        info.batch, info.channels, info.in_width, info.out_width,
        info.kernel_size, info.stride, info.padding,
        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
    return INFINI_STATUS_SUCCESS;
}

#define CALCULATE(TDATA)                                        \
    calculateAvgPool1d(_info,                                   \
                       _opaque->internal->maxThreadsPerBlock(), \
                       (TDATA *)y,                              \
                       (const TDATA *)x,                        \
                       (cudaStream_t)stream)

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(half);
    case INFINI_DTYPE_BF16:
        return CALCULATE(cuda_bfloat16);
    case INFINI_DTYPE_F32:
        return CALCULATE(float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE

} // namespace op::avg_pool1d::nvidia
src/infiniop/ops/avg_pool1d/nvidia/avg_pool1d_nvidia.cuh (new file, mode 100644)

#ifndef __INFINIOP_AVG_POOL1D_CUDA_H__
#define __INFINIOP_AVG_POOL1D_CUDA_H__

#include "../avg_pool1d.h"

DESCRIPTOR(nvidia)

#endif
src/infiniop/ops/avg_pool1d/operator.cc (new file, mode 100644)
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/avg_pool1d.h"
#ifdef ENABLE_CPU_API
#include "cpu/avg_pool1d_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/avg_pool1d_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/avg_pool1d_ascend.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/avg_pool1d_bang.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/avg_pool1d_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/avg_pool1d_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/avg_pool1d_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateAvgPool1dDescriptor(
    infiniopHandle_t handle,
    infiniopAvgPool1dDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x,
    size_t kernel_size,
    size_t stride,
    size_t padding) {

#define CREATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                    \
        return op::avg_pool1d::NAMESPACE::Descriptor::create(                     \
            handle,                                                               \
            reinterpret_cast<op::avg_pool1d::NAMESPACE::Descriptor **>(desc_ptr), \
            y,                                                                    \
            x,                                                                    \
            kernel_size,                                                          \
            stride,                                                               \
            padding)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
        CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__INFINI_C infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(
    infiniopAvgPool1dDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                            \
    case CASE:                                                                                          \
        *size = reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        GET(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}

__INFINI_C infiniStatus_t infiniopAvgPool1d(
    infiniopAvgPool1dDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                       \
        return reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, x, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__INFINI_C infiniStatus_t infiniopDestroyAvgPool1dDescriptor(
    infiniopAvgPool1dDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                       \
    case CASE:                                                                        \
        delete reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
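For orientation, this is roughly how the four exported functions compose from the caller's side. The sketch is an assumption about intended usage, not code from this commit; handle and tensor-descriptor construction are elided because their APIs live elsewhere in the library, and CHECK_STATUS stands in for the caller's own error handling:

// Hypothetical call sequence for the C API defined in this file.
infiniStatus_t runAvgPool1d(infiniopHandle_t handle,
                            infiniopTensorDescriptor_t y_desc,
                            infiniopTensorDescriptor_t x_desc,
                            void *y, const void *x, void *stream) {
    infiniopAvgPool1dDescriptor_t desc = nullptr;
    CHECK_STATUS(infiniopCreateAvgPool1dDescriptor(
        handle, &desc, y_desc, x_desc,
        /*kernel_size=*/3, /*stride=*/2, /*padding=*/1));

    size_t workspace_size = 0;
    CHECK_STATUS(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size));
    // The backends in this commit report a workspace size of 0, so no
    // device allocation is needed here.
    void *workspace = nullptr;

    infiniStatus_t status = infiniopAvgPool1d(desc, workspace, workspace_size, y, x, stream);
    infiniopDestroyAvgPool1dDescriptor(desc);
    return status;
}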
src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.cc (new file, mode 100644)
#include "cross_entropy_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::cross_entropy::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();
    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.outer_size = target_desc->numel();
    info.vocab_size = x_desc->shape().back();
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
    *desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

template <typename T, typename Tidx>
infiniStatus_t cross_entropy_kernel(
    const CrossEntropyInfo *info,
    T *y,
    const T *x,
    const void *target) {
    const Tidx *label = reinterpret_cast<const Tidx *>(target);
#pragma omp parallel for
    for (ptrdiff_t i = 0; i < ptrdiff_t(info->outer_size); ++i) {
        const T *row = x + i * info->x_stride;
        Tidx idx = label[i];
        if (idx < 0 || static_cast<size_t>(idx) >= info->vocab_size) {
            y[i] = utils::cast<T>(0.f);
            continue;
        }
        float max_val = op::common_cpu::reduce_op::max(row, info->vocab_size, 1);
        float sum_exp = 0.f;
        for (size_t j = 0; j < info->vocab_size; ++j) {
            sum_exp += std::exp(utils::cast<float>(row[j]) - max_val);
        }
        float log_term = std::log(sum_exp) + max_val;
        float target_logit = utils::cast<float>(row[idx]);
        y[i] = utils::cast<T>(log_term - target_logit);
    }
    return INFINI_STATUS_SUCCESS;
}

template <typename T>
infiniStatus_t dispatch_target_type(
    const CrossEntropyInfo *info,
    T *y,
    const T *x,
    const void *target) {
    if (info->target_dtype == INFINI_DTYPE_I32) {
        return cross_entropy_kernel<T, int32_t>(info, y, x, target);
    } else if (info->target_dtype == INFINI_DTYPE_I64) {
        return cross_entropy_kernel<T, int64_t>(info, y, x, target);
    }
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *target,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return dispatch_target_type(&_info, (fp16_t *)y, (const fp16_t *)x, target);
    case INFINI_DTYPE_BF16:
        return dispatch_target_type(&_info, (bf16_t *)y, (const bf16_t *)x, target);
    case INFINI_DTYPE_F32:
        return dispatch_target_type(&_info, (float *)y, (const float *)x, target);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::cross_entropy::cpu
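The per-row loss computed above is the numerically stable log-sum-exp form of -log(softmax(x)[target]): with m = max_j x_j, loss = log Σ_j exp(x_j − m) + m − x_target. A self-contained numeric check, with illustrative values:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const float row[3] = {2.0f, 1.0f, 0.5f};
    const int target = 0;
    float m = *std::max_element(row, row + 3);
    float sum_exp = 0.0f;
    for (float v : row) {
        sum_exp += std::exp(v - m); // every exponent is <= 0, so no overflow
    }
    float loss = std::log(sum_exp) + m - row[target];
    // Equals -log(softmax(row)[target]); prints roughly 0.4644.
    std::printf("loss = %f\n", loss);
    return 0;
}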
src/infiniop/ops/cross_entropy/cpu/cross_entropy_cpu.h (new file, mode 100644)

#ifndef __CROSS_ENTROPY_CPU_H__
#define __CROSS_ENTROPY_CPU_H__

#include "../cross_entropy.h"

DESCRIPTOR(cpu)

#endif
src/infiniop/ops/cross_entropy/cross_entropy.h (new file, mode 100644)
#ifndef CROSS_ENTROPY_H
#define CROSS_ENTROPY_H
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
namespace op::cross_entropy::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
CrossEntropyInfo _info; \
size_t _workspace_size; \
\
Descriptor(Opaque *opaque, \
CrossEntropyInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
static infiniStatus_t create(infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t target_desc); \
infiniStatus_t calculate(void *workspace, \
size_t workspace_size, \
void *y, \
const void *x, \
const void *target, \
void *stream) const; \
}; \
}
#endif
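For reference, DESCRIPTOR(cpu) in cross_entropy_cpu.h stamps out roughly the following class (an abridged expansion of the macro above; one such class exists per backend namespace, all with the same interface):

// Abridged expansion of DESCRIPTOR(cpu).
namespace op::cross_entropy::cpu {
class Descriptor final : public InfiniopDescriptor {
    struct Opaque;
    Opaque *_opaque;
    CrossEntropyInfo _info;
    size_t _workspace_size;
    // ... private constructor as in the macro ...
public:
    ~Descriptor();
    size_t workspaceSize() const { return _workspace_size; }
    static infiniStatus_t create(infiniopHandle_t handle,
                                 Descriptor **desc_ptr,
                                 infiniopTensorDescriptor_t y_desc,
                                 infiniopTensorDescriptor_t x_desc,
                                 infiniopTensorDescriptor_t target_desc);
    infiniStatus_t calculate(void *workspace, size_t workspace_size,
                             void *y, const void *x, const void *target,
                             void *stream) const;
};
} // namespace op::cross_entropy::cpu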
src/infiniop/ops/cross_entropy/cuda/kernel.cuh (new file, mode 100644)
#ifndef __CROSS_ENTROPY_KERNEL_CUH__
#define __CROSS_ENTROPY_KERNEL_CUH__
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute = float>
__device__ void crossEntropyKernel(
    Tdata *y_,
    const Tdata *x_,
    const void *target_,
    size_t outer_size,
    size_t vocab_size,
    ptrdiff_t x_stride) {
    size_t row_idx = blockIdx.x;
    if (row_idx >= outer_size) {
        return;
    }
    const Tdata *x = x_ + row_idx * x_stride;
    const Tidx *target = reinterpret_cast<const Tidx *>(target_);
    Tidx label = target[row_idx];
    // Row maximum for the stable log-sum-exp.
    Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
    __shared__ Tcompute max_val_shared;
    if (threadIdx.x == 0) {
        max_val_shared = static_cast<Tcompute>(max_val_raw);
    }
    __syncthreads();
    Tcompute max_val = max_val_shared;
    Tcompute thread_sum = 0.0f;
    for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
        Tcompute val = static_cast<Tcompute>(x[col]);
        thread_sum += expf(val - max_val);
    }
    // Warp-level tree reduction of the per-thread partial sums.
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        thread_sum += __shfl_down_sync(0xffffffff, thread_sum, offset);
    }
    static __shared__ Tcompute shared_sum[32];
    int lane = threadIdx.x % warpSize;
    int warp = threadIdx.x / warpSize;
    if (lane == 0) {
        shared_sum[warp] = thread_sum;
    }
    __syncthreads();
    // Warp 0 reduces the per-warp partials into the block total.
    Tcompute block_sum = 0.0f;
    if (warp == 0) {
        if (lane < (BLOCK_SIZE + warpSize - 1) / warpSize) {
            block_sum = shared_sum[lane];
        }
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            block_sum += __shfl_down_sync(0xffffffff, block_sum, offset);
        }
    }
    if (threadIdx.x == 0) {
        Tcompute log_term = logf(block_sum) + max_val;
        Tcompute target_logit = 0.0f;
        if (label >= 0 && static_cast<size_t>(label) < vocab_size) {
            target_logit = static_cast<Tcompute>(x[label]);
        } else {
            // Out-of-range target: emit 0 instead of a garbage loss.
            log_term = 0.0f;
        }
        y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
    }
}

#endif
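The block reduction here is hand-rolled in two stages: a warp-level tree reduction with __shfl_down_sync, then a second pass by warp 0 over the per-warp partials. A stripped-down sketch of the same pattern in isolation, assuming a fully populated block so the 0xffffffff participation mask is valid:

// Standalone sketch (not from this commit) of the two-stage block reduction.
template <unsigned int BLOCK_SIZE>
__global__ void blockSum(const float *in, float *out) {
    float v = in[blockIdx.x * BLOCK_SIZE + threadIdx.x];
    // Stage 1: each warp folds its 32 lanes into lane 0.
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        v += __shfl_down_sync(0xffffffff, v, offset);
    }
    __shared__ float partial[32];
    int lane = threadIdx.x % warpSize;
    int warp = threadIdx.x / warpSize;
    if (lane == 0) {
        partial[warp] = v;
    }
    __syncthreads();
    // Stage 2: the first warp folds the per-warp partials.
    if (warp == 0) {
        v = (lane < BLOCK_SIZE / warpSize) ? partial[lane] : 0.0f;
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            v += __shfl_down_sync(0xffffffff, v, offset);
        }
        if (lane == 0) {
            out[blockIdx.x] = v;
        }
    }
}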
src/infiniop/ops/cross_entropy/info.h (new file, mode 100644)
#ifndef CROSS_ENTROPY_INFO_H
#define CROSS_ENTROPY_INFO_H
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
#include <cstddef>
struct CrossEntropyInfo {
    int dtype;
    int target_dtype;
    size_t outer_size;
    size_t vocab_size;
    ptrdiff_t x_stride;
};

#endif
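As a concrete reading of these fields: for contiguous logits of shape [batch, seq_len, vocab] with targets of shape [batch, seq_len], the backend create() functions in this commit fill the struct as sketched below (a hypothetical helper mirroring their assignments; dtype fields omitted):

#include <cstddef>

// Illustrative only: how the fields map onto a contiguous
// [batch, seq_len, vocab] logits tensor.
CrossEntropyInfo makeInfo(size_t batch, size_t seq_len, size_t vocab) {
    CrossEntropyInfo info{};
    info.outer_size = batch * seq_len;             // target_desc->numel()
    info.vocab_size = vocab;                       // x_desc->shape().back()
    info.x_stride = static_cast<ptrdiff_t>(vocab); // rows assumed contiguous
    return info;
}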
src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.h (new file, mode 100644)

#ifndef __CROSS_ENTROPY_METAX_H__
#define __CROSS_ENTROPY_METAX_H__

#include "../cross_entropy.h"

DESCRIPTOR(metax)

#endif // __CROSS_ENTROPY_METAX_H__
src/infiniop/ops/cross_entropy/metax/cross_entropy_metax.maca (new file, mode 100644)
#include "../../../devices/metax/metax_common.h"
#include "cross_entropy_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include <cmath>
namespace {
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
__device__ void crossEntropyKernel(
Tdata *y_,
const Tdata *x_,
const void *target_,
size_t outer_size,
size_t vocab_size,
ptrdiff_t x_stride) {
size_t row_idx = blockIdx.x;
if (row_idx >= outer_size) {
return;
}
const Tdata *x = x_ + row_idx * x_stride;
const Tidx *target = reinterpret_cast<const Tidx *>(target_);
Tidx label = target[row_idx];
Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
__shared__ Tcompute max_val_shared;
if (threadIdx.x == 0) {
max_val_shared = static_cast<Tcompute>(max_val_raw);
}
__syncthreads();
Tcompute max_val = max_val_shared;
Tcompute thread_sum = Tcompute(0);
for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
Tcompute val = static_cast<Tcompute>(x[col]);
thread_sum += expf(val - max_val);
}
using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
__shared__ typename BlockReduce::TempStorage temp_storage;
Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
if (threadIdx.x == 0) {
if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
y_[row_idx] = static_cast<Tdata>(0.0f);
return;
}
Tcompute log_term = logf(block_sum) + max_val;
Tcompute target_logit = static_cast<Tcompute>(x[label]);
y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
}
}
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
INFINIOP_METAX_KERNEL crossEntropy(
Tdata *y, const Tdata *x, const void *target,
size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
y, x, target, outer_size, vocab_size, x_stride);
}
} // namespace
namespace op::cross_entropy::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc) {
(void)y_desc;
auto x_dtype = x_desc->dtype();
auto t_dtype = target_desc->dtype();
CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
CrossEntropyInfo info{};
info.dtype = x_dtype;
info.target_dtype = t_dtype;
info.vocab_size = x_desc->shape().back();
info.outer_size = target_desc->numel();
info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
info, 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
const CrossEntropyInfo &info, hcStream_t stream) {
dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
if (info.target_dtype == INFINI_DTYPE_I64) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, cuda_bfloat16, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (info.target_dtype == INFINI_DTYPE_I32) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, cuda_bfloat16, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(cuda_bfloat16 *)y, (const cuda_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream_) const {
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
auto stream = reinterpret_cast<hcStream_t>(stream_);
int max_threads = _opaque->internal->maxThreadsPerBlock();
if (max_threads >= METAX_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(y, x, target, _info, stream));
} else if (max_threads >= METAX_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_512>(y, x, target, _info, stream));
} else {
CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::cross_entropy::metax
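BLOCK_SIZE is a template parameter rather than a runtime value because cub::BlockReduce<Tcompute, BLOCK_SIZE> bakes the block size into its TempStorage type, so the runtime dispatch in calculate() must choose among separate instantiations. A condensed standalone sketch of that dispatch shape:

#include <cstdio>

// Condensed form of the runtime-to-compile-time dispatch used above: one
// instantiation exists per supported block size, and an if-chain picks one.
template <unsigned int BLOCK_SIZE>
void launchAt() {
    std::printf("would launch with BLOCK_SIZE=%u\n", BLOCK_SIZE);
}

void dispatch(int max_threads) {
    if (max_threads >= 1024) {
        launchAt<1024>();
    } else if (max_threads >= 512) {
        launchAt<512>();
    } else {
        launchAt<256>();
    }
}

int main() {
    dispatch(1024);
    return 0;
}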
src/infiniop/ops/cross_entropy/moore/cross_entropy_kernel.h (new file, mode 100644)
#ifndef __CROSS_ENTROPY_KERNEL_CUH__
#define __CROSS_ENTROPY_KERNEL_CUH__

template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
__device__ void crossEntropyKernel(
    Tdata *y_,
    const Tdata *x_,
    const void *target_,
    size_t outer_size,
    size_t vocab_size,
    ptrdiff_t x_stride) {
    size_t row_idx = blockIdx.x;
    if (row_idx >= outer_size) {
        return;
    }
    const Tdata *x = x_ + row_idx * x_stride;
    const Tidx *target = reinterpret_cast<const Tidx *>(target_);
    Tidx label = target[row_idx];
    Tdata max_val_raw = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata>(x, vocab_size);
    __shared__ Tcompute max_val_shared;
    if (threadIdx.x == 0) {
        max_val_shared = static_cast<Tcompute>(max_val_raw);
    }
    __syncthreads();
    Tcompute max_val = max_val_shared;
    Tcompute thread_sum = Tcompute(0);
    for (size_t col = threadIdx.x; col < vocab_size; col += BLOCK_SIZE) {
        Tcompute val = static_cast<Tcompute>(x[col]);
        thread_sum += expf(val - max_val);
    }
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    Tcompute block_sum = BlockReduce(temp_storage).Sum(thread_sum);
    if (threadIdx.x == 0) {
        if (label < 0 || static_cast<size_t>(label) >= vocab_size) {
            y_[row_idx] = static_cast<Tdata>(0.0f);
            return;
        }
        Tcompute log_term = logf(block_sum) + max_val;
        Tcompute target_logit = static_cast<Tcompute>(x[label]);
        y_[row_idx] = static_cast<Tdata>(log_term - target_logit);
    }
}

#endif
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.h (new file, mode 100644)

#ifndef __CROSS_ENTROPY_MOORE_H__
#define __CROSS_ENTROPY_MOORE_H__

#include "../cross_entropy.h"

DESCRIPTOR(moore)

#endif
src/infiniop/ops/cross_entropy/moore/cross_entropy_moore.mu (new file, mode 100644)
#include "../../../devices/moore/moore_common.h"
#include "cross_entropy_moore.h"
#include <cub/block/block_reduce.cuh>
#include "../../../devices/moore/moore_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
#include "cross_entropy_kernel.h"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute>
INFINIOP_MOORE_KERNEL crossEntropy(
Tdata *y, const Tdata *x, const void *target,
size_t outer_size, size_t vocab_size, ptrdiff_t x_stride) {
crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
y, x, target, outer_size, vocab_size, x_stride);
}
namespace op::cross_entropy::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc) {
(void)y_desc;
auto x_dtype = x_desc->dtype();
auto t_dtype = target_desc->dtype();
CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
CrossEntropyInfo info{};
info.dtype = x_dtype;
info.target_dtype = t_dtype;
info.vocab_size = x_desc->shape().back();
info.outer_size = target_desc->numel();
info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info, 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(void *y, const void *x, const void *target,
const CrossEntropyInfo &info, musaStream_t stream) {
dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
if (info.target_dtype == INFINI_DTYPE_I64) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, __mt_bfloat16, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int64_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (info.target_dtype == INFINI_DTYPE_I32) {
if (info.dtype == INFINI_DTYPE_F16) {
crossEntropy<BLOCK_SIZE, half, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(half *)y, (const half *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_BF16) {
crossEntropy<BLOCK_SIZE, __mt_bfloat16, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(__mt_bfloat16 *)y, (const __mt_bfloat16 *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else if (info.dtype == INFINI_DTYPE_F32) {
crossEntropy<BLOCK_SIZE, float, int32_t, float>
<<<grid, BLOCK_SIZE, 0, stream>>>(
(float *)y, (const float *)x, target,
info.outer_size, info.vocab_size, info.x_stride);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
(void)workspace;
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(y, x, target, _info, stream));
} else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(y, x, target, _info, stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::cross_entropy::moore
src/infiniop/ops/cross_entropy/nvidia/cross_entropy_nvidia.cu (new file, mode 100644)
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "cross_entropy_nvidia.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tidx, typename Tcompute = float>
INFINIOP_CUDA_KERNEL crossEntropy(
    Tdata *y,
    const Tdata *x,
    const void *target,
    size_t outer_size,
    size_t vocab_size,
    ptrdiff_t x_stride) {
    crossEntropyKernel<BLOCK_SIZE, Tdata, Tidx, Tcompute>(
        y, x, target, outer_size, vocab_size, x_stride);
}

namespace op::cross_entropy::nvidia {

struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();
    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.vocab_size = x_desc->shape().back();
    info.outer_size = target_desc->numel();
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);
    auto internal = reinterpret_cast<device::nvidia::Handle *>(handle)->internal();
    *desc_ptr = new Descriptor(
        new Opaque{internal},
        info,
        0,
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    void *y,
    const void *x,
    const void *target,
    const CrossEntropyInfo &info,
    cudaStream_t stream) {
    dim3 grid(static_cast<uint32_t>(info.outer_size), 1, 1);
    if (info.target_dtype == INFINI_DTYPE_I64) {
        if (info.dtype == INFINI_DTYPE_F16) {
            crossEntropy<BLOCK_SIZE, half, int64_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (half *)y, (const half *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        } else if (info.dtype == INFINI_DTYPE_BF16) {
            crossEntropy<BLOCK_SIZE, __nv_bfloat16, int64_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        } else if (info.dtype == INFINI_DTYPE_F32) {
            crossEntropy<BLOCK_SIZE, float, int64_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (float *)y, (const float *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        }
    } else if (info.target_dtype == INFINI_DTYPE_I32) {
        if (info.dtype == INFINI_DTYPE_F16) {
            crossEntropy<BLOCK_SIZE, half, int32_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (half *)y, (const half *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        } else if (info.dtype == INFINI_DTYPE_BF16) {
            crossEntropy<BLOCK_SIZE, __nv_bfloat16, int32_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        } else if (info.dtype == INFINI_DTYPE_F32) {
            crossEntropy<BLOCK_SIZE, float, int32_t><<<grid, BLOCK_SIZE, 0, stream>>>(
                (float *)y, (const float *)x, target,
                info.outer_size, info.vocab_size, info.x_stride);
        }
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *target,
    void *stream_) const {
    cudaStream_t stream = (cudaStream_t)stream_;
    int max_threads = _opaque->internal->maxThreadsPerBlock();
    if (max_threads >= 1024) {
        CHECK_STATUS(launchKernel<1024>(y, x, target, _info, stream));
    } else if (max_threads >= 512) {
        CHECK_STATUS(launchKernel<512>(y, x, target, _info, stream));
    } else {
        CHECK_STATUS(launchKernel<256>(y, x, target, _info, stream));
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::cross_entropy::nvidia
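All backends in this commit compute the same per-row quantity, written here for row i with target index t_i over vocabulary size V:

\[
\ell_i \;=\; \log \sum_{j=0}^{V-1} e^{x_{ij}} \;-\; x_{i,t_i}
\;=\; \Bigl( \log \sum_{j=0}^{V-1} e^{x_{ij} - m_i} \Bigr) + m_i - x_{i,t_i},
\qquad m_i = \max_j x_{ij}.
\]

Subtracting the row maximum m_i keeps every exponent at or below zero, so the sum cannot overflow; rows whose target index is out of range are written as 0 rather than propagating an invalid read.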