Commit cb7f0b7d authored by wooway777's avatar wooway777
Browse files

Revert "Merge pull request #1056 from InfiniTensor/issue/1031"

This reverts commit 7f295448, reversing
changes made to e60985dc.
parent 037140c0
#ifndef __ASINH_MOORE_API_H__
#define __ASINH_MOORE_API_H__

#include "../../../elementwise/moore/elementwise_moore_api.h"

// Declares op::asinh::moore::Descriptor via the shared elementwise
// descriptor macro.
ELEMENTWISE_DESCRIPTOR(asinh, moore)

#endif // __ASINH_MOORE_API_H__
#include "asinh_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "../cuda/kernel.cuh"
namespace op::asinh::moore {

Descriptor::~Descriptor() = default;

// Builds an asinh descriptor for the Moore backend: validates that the
// output dtype is one of F16/F32/F64/BF16 and that the (single) input
// shape matches the output shape, then delegates to the shared MOORE
// elementwise descriptor factory.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Launches the elementwise asinh kernel for the dtype recorded at create
// time. Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE when the caller's
// workspace is smaller than workspaceSize().
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Every switch path returns, so no trailing return is needed; the
    // previously unreachable `return INFINI_STATUS_SUCCESS;` was removed
    // to match the nvidia implementation.
}
} // namespace op::asinh::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "asinh_nvidia.cuh"
namespace op::asinh::nvidia {

Descriptor::~Descriptor() = default;

// Creates an asinh descriptor for the NVIDIA backend. The output dtype
// must be one of F16/BF16/F32/F64 and the single input's shape must equal
// the output shape; on success the shared CUDA elementwise descriptor is
// constructed.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);

    const auto &in_desc = input_desc_vec.at(0);
    const auto &out_shape = out_desc->shape();
    const auto &in_shape = in_desc->shape();
    auto dtype = out_desc->dtype();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(out_shape, in_shape);

    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Dispatches the elementwise asinh kernel according to the dtype captured
// at descriptor-creation time; rejects undersized workspaces first.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::asinh::nvidia
#ifndef __ASINH_NVIDIA_API_H__
#define __ASINH_NVIDIA_API_H__

#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"

// Declares op::asinh::nvidia::Descriptor via the shared elementwise
// descriptor macro.
ELEMENTWISE_DESCRIPTOR(asinh, nvidia)

#endif // __ASINH_NVIDIA_API_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/asinh.h"
#ifdef ENABLE_CPU_API
#include "cpu/asinh_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/asinh_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/asinh_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/asinh_moore.h"
#endif
// C API entry point: creates an asinh descriptor for the device identified
// by `handle->device`, forwarding to the matching backend implementation.
// `y_desc` is the output tensor, `x_desc` the single input.
__INFINI_C infiniStatus_t infiniopCreateAsinhDescriptor(
    infiniopHandle_t handle,
    infiniopAsinhDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

// One switch case per backend: cast desc_ptr to the backend descriptor
// type and delegate to its Descriptor::create with {x_desc} as inputs.
#define CREATE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        return op::asinh::NAMESPACE::Descriptor::create(                     \
            handle,                                                          \
            reinterpret_cast<op::asinh::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                          \
            {x_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Reports the workspace size (in bytes) required by infiniopAsinh for the
// given descriptor, dispatching on the device type the descriptor was
// created for.
__INFINI_C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) {

// One switch case per backend: read workspaceSize() from the typed descriptor.
#define GET(CASE, NAMESPACE)                                                                       \
    case CASE:                                                                                     \
        *size = reinterpret_cast<const op::asinh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
    // Every switch path returns above; the previously duplicated fallback
    // return after the switch was unreachable and has been removed.
}
// C API entry point: computes y = asinh(x) using the backend the descriptor
// was created for. `workspace`/`workspace_size` must satisfy
// infiniopGetAsinhWorkspaceSize; `stream` is the backend stream handle.
__INFINI_C infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc,
                                        void *workspace,
                                        size_t workspace_size,
                                        void *y,
                                        const void *x,
                                        void *stream) {

// One switch case per backend: forward to the typed descriptor's calculate().
#define CALCULATE(CASE, NAMESPACE)                                              \
    case CASE:                                                                  \
        return reinterpret_cast<const op::asinh::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x}, stream);

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// C API entry point: destroys a descriptor previously created with
// infiniopCreateAsinhDescriptor, deleting it through its concrete backend type.
__INFINI_C infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) {

// One switch case per backend: delete via the typed descriptor pointer.
#define DESTROY(CASE, NAMESPACE)                                          \
    case CASE:                                                            \
        delete reinterpret_cast<op::asinh::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DESTROY
}
#ifndef __AVG_POOL1D_H__
#define __AVG_POOL1D_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "infiniop/ops/avg_pool1d.h"
// Declares the per-backend avg_pool1d Descriptor class; instantiated once
// per backend namespace (cpu, nvidia, metax, moore, ...) via DESCRIPTOR(ns).
// The descriptor holds the validated AvgPool1dInfo, a backend-specific
// Opaque state pointer, and the required workspace size. (Comments cannot
// appear inside the macro body because of the line continuations.)
#define DESCRIPTOR(NAMESPACE)                                     \
    namespace op::avg_pool1d::NAMESPACE {                         \
    class Descriptor final : public InfiniopDescriptor {          \
        struct Opaque;                                            \
        Opaque *_opaque;                                          \
        AvgPool1dInfo _info;                                      \
        size_t _workspace_size;                                   \
                                                                  \
        Descriptor(                                               \
            AvgPool1dInfo info,                                   \
            size_t workspace_size_,                               \
            Opaque *opaque,                                       \
            infiniDevice_t device_type,                           \
            int device_id)                                        \
            : InfiniopDescriptor{device_type, device_id},         \
              _opaque(opaque),                                    \
              _info(info),                                        \
              _workspace_size(workspace_size_) {}                 \
                                                                  \
    public:                                                       \
        ~Descriptor();                                            \
                                                                  \
        size_t workspaceSize() const { return _workspace_size; }  \
                                                                  \
        static infiniStatus_t create(                             \
            infiniopHandle_t handle,                              \
            Descriptor **desc_ptr,                                \
            infiniopTensorDescriptor_t y_desc,                    \
            infiniopTensorDescriptor_t x_desc,                    \
            size_t kernel_size,                                   \
            size_t stride,                                        \
            size_t padding);                                      \
                                                                  \
        infiniStatus_t calculate(                                 \
            void *workspace,                                      \
            size_t workspace_size,                                \
            void *y,                                              \
            const void *x,                                        \
            void *stream) const;                                  \
    };                                                            \
    }
// Validated, backend-agnostic description of a 1-D average pooling problem.
// x is (batch, channels, in_width); y is (batch, channels, out_width).
class AvgPool1dInfo {
private:
    AvgPool1dInfo() = default;

public:
    // dtype shared by x and y
    infiniDtype_t dtype;
    // logical dimensions
    size_t batch, channels, in_width, out_width;
    // pooling hyper-parameters, in elements
    size_t kernel_size, stride, padding;
    // element strides of y and x (signed: descriptors may carry negative strides)
    ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width;
    ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width;

    // Validates the descriptors and hyper-parameters and packs them into an
    // AvgPool1dInfo. Requirements: non-null 3-D descriptors, matching
    // supported dtypes, kernel_size > 0, stride > 0, and
    // y.dim(2) == (in_width + 2*padding - kernel_size) / stride + 1.
    static utils::Result<AvgPool1dInfo> createAvgPool1dInfo(
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc,
        size_t kernel_size,
        size_t stride,
        size_t padding) {
        CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t dtype = y_desc->dtype();
        CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
        CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE);

        // Reject degenerate hyper-parameters up front: stride == 0 would
        // divide by zero below and kernel_size == 0 makes the average
        // undefined.
        CHECK_OR_RETURN(kernel_size > 0 && stride > 0, INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t batch = x_desc->dim(0);
        size_t channels = x_desc->dim(1);
        size_t in_width = x_desc->dim(2);
        CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_OR_RETURN(y_desc->dim(1) == channels, INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t padded_len = in_width + 2 * padding;
        CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE);
        size_t expected_out_width = (padded_len - kernel_size) / stride + 1;
        CHECK_OR_RETURN(y_desc->dim(2) == expected_out_width, INFINI_STATUS_BAD_TENSOR_SHAPE);
        size_t out_width = expected_out_width;

        return utils::Result<AvgPool1dInfo>(AvgPool1dInfo{
            dtype,
            batch, channels, in_width, out_width,
            kernel_size, stride, padding,
            y_desc->stride(0), y_desc->stride(1), y_desc->stride(2),
            x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)});
    }
};
#endif
#include "avg_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include <algorithm>
namespace op::avg_pool1d::cpu {

Descriptor::~Descriptor() = default;

// Validates the tensor descriptors / pooling parameters and builds a CPU
// descriptor. The CPU path needs no opaque state and no workspace.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info);
    *desc_ptr = new Descriptor(
        info.take(),
        0,       // workspace size: none required on CPU
        nullptr, // no opaque state on CPU
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Reference CPU implementation of 1-D average pooling with
// count_include_pad semantics: the divisor is always kernel_size, even
// when the window overlaps the zero padding. Accumulation happens in
// float regardless of T; the result is cast back to T per output element.
//
// Offsets are kept signed (ptrdiff_t): the strides come from tensor
// descriptors and may legally be negative, which the previous size_t
// arithmetic would have wrapped.
template <typename T>
infiniStatus_t calculateAvgPool1d(const AvgPool1dInfo &info,
                                  T *y,
                                  const T *x) {
    const float inv_kernel = 1.0f / static_cast<float>(info.kernel_size);

#pragma omp parallel for
    for (ptrdiff_t bc = 0; bc < ptrdiff_t(info.batch * info.channels); ++bc) {
        ptrdiff_t b = bc / info.channels;
        ptrdiff_t c = bc % info.channels;
        ptrdiff_t y_base = b * info.y_stride_batch + c * info.y_stride_channel;
        ptrdiff_t x_base = b * info.x_stride_batch + c * info.x_stride_channel;

        for (size_t ow = 0; ow < info.out_width; ++ow) {
            ptrdiff_t y_offset = y_base + ptrdiff_t(ow) * info.y_stride_width;

            // Window [start_w, end_w) in input coordinates, clipped to the
            // valid range; padded positions contribute zero. The padding is
            // cast explicitly so the subtraction stays signed.
            long long start_w = static_cast<long long>(ow * info.stride) - static_cast<long long>(info.padding);
            long long end_w = start_w + static_cast<long long>(info.kernel_size);
            long long valid_start = std::max(0LL, start_w);
            long long valid_end = std::min(static_cast<long long>(info.in_width), end_w);

            float sum = 0.0f;
            for (long long iw = valid_start; iw < valid_end; ++iw) {
                ptrdiff_t x_offset = x_base + ptrdiff_t(iw) * info.x_stride_width;
                sum += utils::cast<float>(x[x_offset]);
            }
            y[y_offset] = utils::cast<T>(sum * inv_kernel);
        }
    }
    return INFINI_STATUS_SUCCESS;
}
// Instantiates the typed CPU implementation for TDATA.
#define CALCULATE(TDATA) calculateAvgPool1d(_info, (TDATA *)y, (const TDATA *)x)

// Dispatches on the dtype validated at create time. `workspace`,
// `workspace_size` and `stream` are unused on the CPU path.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(fp16_t);
    case INFINI_DTYPE_BF16:
        return CALCULATE(bf16_t);
    case INFINI_DTYPE_F32:
        return CALCULATE(float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE
} // namespace op::avg_pool1d::cpu
#ifndef __INFINIOP_AVG_POOL1D_CPU_H__
#define __INFINIOP_AVG_POOL1D_CPU_H__

#include "../avg_pool1d.h"

// Declares op::avg_pool1d::cpu::Descriptor.
DESCRIPTOR(cpu)

#endif
#ifndef __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__
#define __INFINIOP_AVG_POOL1D_CUDA_KERNEL_CUH__

#include <type_traits>

// Grid-stride 1-D average-pooling device kernel shared by the CUDA-like
// backends. One output element per loop iteration; count_include_pad
// semantics (divisor is always kernel_size, padded positions contribute 0).
//
// Accumulation uses float for every T except double (which accumulates in
// double), so half/bf16 inputs no longer lose precision in the inner sum;
// this matches the metax/moore kernel implementations. Dividing in the
// accumulator type also removes the earlier Iluvatar-specific workaround
// (its __half could not be constructed from size_t).
template <typename T>
__device__ void avgPool1dKernel(
    T *y,
    const T *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    using Acc = typename std::conditional<std::is_same<T, double>::value, double, float>::type;

    size_t total_elements = batch * channels * out_width;
    const Acc inv_kernel = Acc(1) / static_cast<Acc>(kernel_size);

    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        // Decompose the flat index into (b, c, ow).
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;

        // Window start in input coordinates; may be negative due to padding.
        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);

        Acc sum = Acc(0);
        for (size_t k = 0; k < kernel_size; ++k) {
            long long iw = start_w + static_cast<long long>(k);
            if (iw >= 0 && iw < static_cast<long long>(in_width)) {
                size_t x_offset = b * x_stride_batch + c * x_stride_channel + iw * x_stride_width;
                sum += static_cast<Acc>(x[x_offset]);
            }
        }
        y[y_offset] = static_cast<T>(sum * inv_kernel);
    }
}

#endif
#ifndef __INFINIOP_AVG_POOL1D_METAX_H__
#define __INFINIOP_AVG_POOL1D_METAX_H__

#include "../avg_pool1d.h"

// Declares op::avg_pool1d::metax::Descriptor.
DESCRIPTOR(metax)

#endif // __INFINIOP_AVG_POOL1D_METAX_H__
#include "../../../devices/metax/metax_common.h"
#include "avg_pool1d_metax.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include <type_traits>
namespace op::avg_pool1d::metax {

// Per-descriptor opaque state: keeps the metax handle internals alive so
// device limits (e.g. max threads per block) can be queried at launch time.
struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates (y, x, kernel_size, stride, padding) via
// AvgPool1dInfo::createAvgPool1dInfo and, on success, allocates a
// Descriptor that shares the metax handle internals. No device workspace
// is required (workspace size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto metax_handle = reinterpret_cast<device::metax::Handle *>(handle_);

    auto info_result = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info_result);

    *desc_ptr = new Descriptor(
        info_result.take(),
        /*workspace_size=*/0,
        new Opaque{metax_handle->internal()},
        metax_handle->device,
        metax_handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Converts an accumulator value back to the storage type; the 16-bit
// floating-point formats are routed through float conversion intrinsics.
template <typename Tdata, typename Tcompute>
__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
    if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
        return __float2bfloat16(static_cast<float>(val));
    } else if constexpr (std::is_same_v<Tdata, half>) {
        return __float2half(static_cast<float>(val));
    } else {
        return static_cast<Tdata>(val);
    }
}
// Grid-stride 1-D average pooling kernel for the metax backend.
// One output element per iteration; count_include_pad semantics (the
// divisor is always kernel_size). Accumulates in Tcompute and converts
// back to Tdata on store.
template <typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL avgPool1dGlobalKernel(
    Tdata *y,
    const Tdata *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    size_t total_elements = batch * channels * out_width;
    Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        // Decompose the flat index into (b, c, ow).
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
        size_t x_base = b * x_stride_batch + c * x_stride_channel;
        // Clip the window [start_w, end_w) to the valid input range; padded
        // positions contribute zero to the sum.
        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
        long long end_w = start_w + static_cast<long long>(kernel_size);
        long long iw_start = start_w < 0 ? 0 : start_w;
        long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
        Tcompute sum = Tcompute(0);
        if (iw_start < iw_end) {
            // Walk the window by repeatedly adding the width stride.
            size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
            for (long long iw = iw_start; iw < iw_end; ++iw) {
                sum += static_cast<Tcompute>(x[x_offset]);
                x_offset += x_stride_width;
            }
        }
        y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
    }
}
// Host-side launcher for the metax backend: picks a block size (default
// 256, clamped to the device limit when known), derives a capped grid and
// launches the grid-stride kernel. The 65535-block cap is safe because the
// kernel strides over any remaining elements.
template <typename Tdata, typename Tcompute>
infiniStatus_t calculateAvgPool1d(
    const AvgPool1dInfo &info,
    int max_threads_per_block,
    Tdata *y,
    const Tdata *x,
    hcStream_t stream) {
    const size_t element_count = info.batch * info.channels * info.out_width;

    int threads = 256;
    if (max_threads_per_block > 0 && max_threads_per_block < threads) {
        threads = max_threads_per_block;
    }

    size_t blocks = (element_count + threads - 1) / threads;
    blocks = blocks > 65535 ? 65535 : blocks;

    avgPool1dGlobalKernel<Tdata, Tcompute><<<blocks, threads, 0, stream>>>(
        y, x,
        info.batch, info.channels, info.in_width, info.out_width,
        info.kernel_size, info.stride, info.padding,
        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
    return INFINI_STATUS_SUCCESS;
}
// Forwards to the typed launcher with the device's max-threads-per-block
// limit and the metax stream.
#define CALCULATE(TDATA, TCOMPUTE)               \
    calculateAvgPool1d<TDATA, TCOMPUTE>(         \
        _info,                                   \
        _opaque->internal->maxThreadsPerBlock(), \
        (TDATA *)y,                              \
        (const TDATA *)x,                        \
        (hcStream_t)stream)

// Dispatches on dtype: 16-bit float types accumulate in float, F64 in
// double. The workspace pointer is unused (workspaceSize() is 0) but the
// size is still validated for API consistency.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    (void)workspace;
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(half, float);
    case INFINI_DTYPE_BF16:
        return CALCULATE(cuda_bfloat16, float);
    case INFINI_DTYPE_F32:
        return CALCULATE(float, float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double, double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE
} // namespace op::avg_pool1d::metax
#ifndef __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
#define __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__

#include <type_traits>

namespace op::avg_pool1d::moore {

// Converts an accumulator value back to the storage type; the 16-bit
// floating-point formats are routed through float conversion intrinsics.
template <typename Tdata, typename Tcompute>
__device__ __forceinline__ Tdata castToOutput(Tcompute val) {
    if constexpr (std::is_same_v<Tdata, half>) {
        return __float2half(static_cast<float>(val));
    } else if constexpr (std::is_same_v<Tdata, cuda_bfloat16>) {
        return __float2bfloat16_rn(static_cast<float>(val));
    } else {
        return static_cast<Tdata>(val);
    }
}

// Grid-stride 1-D average pooling device kernel for the moore backend.
// One output element per iteration; count_include_pad semantics (the
// divisor is always kernel_size). Accumulates in Tcompute and converts
// back to Tdata on store.
template <typename Tdata, typename Tcompute>
__device__ void avgPool1dKernel(
    Tdata *y,
    const Tdata *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    size_t total_elements = batch * channels * out_width;
    Tcompute inv_kernel = Tcompute(1) / static_cast<Tcompute>(kernel_size);
    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < total_elements;
         idx += gridDim.x * blockDim.x) {
        // Decompose the flat index into (b, c, ow).
        size_t ow = idx % out_width;
        size_t temp = idx / out_width;
        size_t c = temp % channels;
        size_t b = temp / channels;
        size_t y_offset = b * y_stride_batch + c * y_stride_channel + ow * y_stride_width;
        size_t x_base = b * x_stride_batch + c * x_stride_channel;
        // Clip the window [start_w, end_w) to the valid input range; padded
        // positions contribute zero to the sum.
        long long start_w = static_cast<long long>(ow * stride) - static_cast<long long>(padding);
        long long end_w = start_w + static_cast<long long>(kernel_size);
        long long iw_start = start_w < 0 ? 0 : start_w;
        long long iw_end = end_w > static_cast<long long>(in_width) ? static_cast<long long>(in_width) : end_w;
        Tcompute sum = Tcompute(0);
        if (iw_start < iw_end) {
            // Walk the window by repeatedly adding the width stride.
            size_t x_offset = x_base + static_cast<size_t>(iw_start) * x_stride_width;
            for (long long iw = iw_start; iw < iw_end; ++iw) {
                sum += static_cast<Tcompute>(x[x_offset]);
                x_offset += x_stride_width;
            }
        }
        y[y_offset] = castToOutput<Tdata, Tcompute>(sum * inv_kernel);
    }
}
} // namespace op::avg_pool1d::moore
#endif // __INFINIOP_AVG_POOL1D_MOORE_KERNEL_H__
#ifndef __INFINIOP_AVG_POOL1D_MOORE_H__
#define __INFINIOP_AVG_POOL1D_MOORE_H__

#include "../avg_pool1d.h"

// Declares op::avg_pool1d::moore::Descriptor.
DESCRIPTOR(moore)

#endif // __INFINIOP_AVG_POOL1D_MOORE_H__
#include "../../../devices/moore/moore_common.h"
#include "avg_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "avg_pool1d_kernel.h"
namespace op::avg_pool1d::moore {

// Per-descriptor opaque state: keeps the moore handle internals alive so
// device limits (e.g. max threads per block) can be queried at launch time.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates (y, x, kernel_size, stride, padding) via
// AvgPool1dInfo::createAvgPool1dInfo and, on success, allocates a
// Descriptor that shares the moore handle internals. No device workspace
// is required (workspace size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto moore_handle = reinterpret_cast<device::moore::Handle *>(handle_);

    auto info_result = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info_result);

    *desc_ptr = new Descriptor(
        info_result.take(),
        /*workspace_size=*/0,
        new Opaque{moore_handle->internal()},
        moore_handle->device,
        moore_handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Kernel entry point wrapping the shared device-side avgPool1dKernel
// (avg_pool1d_kernel.h). Launched with a 1-D grid; the inner kernel uses a
// grid-stride loop, so any grid size is correct.
template <typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL avgPool1dGlobalKernel(
    Tdata *y,
    const Tdata *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    avgPool1dKernel<Tdata, Tcompute>(
        y, x,
        batch, channels, in_width, out_width,
        kernel_size, stride, padding,
        y_stride_batch, y_stride_channel, y_stride_width,
        x_stride_batch, x_stride_channel, x_stride_width);
}
// Host-side launcher for the moore backend: picks a block size (default
// 256, clamped to the device limit when known), derives a capped grid and
// launches the grid-stride kernel. The 65535-block cap is safe because the
// kernel strides over any remaining elements.
template <typename Tdata, typename Tcompute>
infiniStatus_t calculateAvgPool1d(
    const AvgPool1dInfo &info,
    int max_threads_per_block,
    Tdata *y,
    const Tdata *x,
    musaStream_t stream) {
    const size_t element_count = info.batch * info.channels * info.out_width;

    int threads = 256;
    if (max_threads_per_block > 0 && max_threads_per_block < threads) {
        threads = max_threads_per_block;
    }

    size_t blocks = (element_count + threads - 1) / threads;
    blocks = blocks > 65535 ? 65535 : blocks;

    avgPool1dGlobalKernel<Tdata, Tcompute><<<blocks, threads, 0, stream>>>(
        y, x,
        info.batch, info.channels, info.in_width, info.out_width,
        info.kernel_size, info.stride, info.padding,
        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
    return INFINI_STATUS_SUCCESS;
}
// Forwards to the typed launcher with the device's max-threads-per-block
// limit and the moore stream.
#define CALCULATE(TDATA, TCOMPUTE)               \
    calculateAvgPool1d<TDATA, TCOMPUTE>(         \
        _info,                                   \
        _opaque->internal->maxThreadsPerBlock(), \
        (TDATA *)y,                              \
        (const TDATA *)x,                        \
        (musaStream_t)stream)

// Dispatches on dtype: 16-bit float types accumulate in float, F64 in
// double. The workspace pointer is unused (workspaceSize() is 0) but the
// size is still validated for API consistency.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    (void)workspace;
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(half, float);
    case INFINI_DTYPE_BF16:
        return CALCULATE(cuda_bfloat16, float);
    case INFINI_DTYPE_F32:
        return CALCULATE(float, float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double, double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE
} // namespace op::avg_pool1d::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "avg_pool1d_nvidia.cuh"
// __global__ entry point wrapping the shared device-side avgPool1dKernel
// (../cuda/kernel.cuh). Launched with a 1-D grid; the inner kernel uses a
// grid-stride loop, so any grid size is correct.
// NOTE(review): unlike the moore variant, this kernel is declared at global
// scope rather than inside op::avg_pool1d::nvidia — consider namespacing it
// to avoid potential symbol clashes.
template <typename T>
__global__ void avgPool1dGlobalKernel(
    T *y,
    const T *x,
    size_t batch,
    size_t channels,
    size_t in_width,
    size_t out_width,
    size_t kernel_size,
    size_t stride,
    size_t padding,
    ptrdiff_t y_stride_batch,
    ptrdiff_t y_stride_channel,
    ptrdiff_t y_stride_width,
    ptrdiff_t x_stride_batch,
    ptrdiff_t x_stride_channel,
    ptrdiff_t x_stride_width) {
    avgPool1dKernel<T>(
        y, x,
        batch, channels, in_width, out_width,
        kernel_size, stride, padding,
        y_stride_batch, y_stride_channel, y_stride_width,
        x_stride_batch, x_stride_channel, x_stride_width);
}
namespace op::avg_pool1d::nvidia {

// Per-descriptor opaque state: keeps the nvidia handle internals alive so
// device limits (e.g. max threads per block) can be queried at launch time.
struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates the tensor descriptors / pooling parameters and allocates an
// NVIDIA descriptor that shares the handle internals. No device workspace
// is required (workspace size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    size_t kernel_size,
    size_t stride,
    size_t padding) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
    CHECK_RESULT(info);
    *desc_ptr = new Descriptor(
        info.take(),
        0, // no workspace needed
        // `handle` is already a device::nvidia::Handle*; the previous
        // redundant reinterpret_cast here was removed.
        new Opaque{handle->internal()},
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Host-side launcher for the CUDA avg-pool kernel: chooses a block size
// (default 256, clamped to the device's max-threads-per-block when that
// limit is known), computes a capped grid, and launches the grid-stride
// kernel. The 65535-block cap is safe because the kernel strides over any
// remaining elements.
template <typename T>
infiniStatus_t calculateAvgPool1d(
    const AvgPool1dInfo &info,
    int max_threads_per_block,
    T *y,
    const T *x,
    cudaStream_t stream) {
    size_t total_elements = info.batch * info.channels * info.out_width;
    int block_size = 256;
    // Compare against block_size rather than repeating the literal 256, so
    // the clamp stays correct if the default ever changes; this matches the
    // metax/moore launchers.
    if (max_threads_per_block > 0 && max_threads_per_block < block_size) {
        block_size = max_threads_per_block;
    }
    size_t grid_size = (total_elements + block_size - 1) / block_size;
    if (grid_size > 65535) {
        grid_size = 65535;
    }
    avgPool1dGlobalKernel<T><<<grid_size, block_size, 0, stream>>>(
        y, x,
        info.batch, info.channels, info.in_width, info.out_width,
        info.kernel_size, info.stride, info.padding,
        info.y_stride_batch, info.y_stride_channel, info.y_stride_width,
        info.x_stride_batch, info.x_stride_channel, info.x_stride_width);
    return INFINI_STATUS_SUCCESS;
}
// Forwards to the typed launcher with the device's max-threads-per-block
// limit and the CUDA stream.
#define CALCULATE(TDATA)                                        \
    calculateAvgPool1d(_info,                                   \
                       _opaque->internal->maxThreadsPerBlock(), \
                       (TDATA *)y,                              \
                       (const TDATA *)x,                        \
                       (cudaStream_t)stream)

// Dispatches on the dtype validated at create time. The workspace is
// unused (workspaceSize() is 0); the size check was added to mirror the
// metax/moore implementations and is behavior-compatible since
// _workspace_size is always 0 here.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) const {
    (void)workspace;
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return CALCULATE(half);
    case INFINI_DTYPE_BF16:
        return CALCULATE(cuda_bfloat16);
    case INFINI_DTYPE_F32:
        return CALCULATE(float);
    case INFINI_DTYPE_F64:
        return CALCULATE(double);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
#undef CALCULATE
} // namespace op::avg_pool1d::nvidia
#ifndef __INFINIOP_AVG_POOL1D_CUDA_H__
#define __INFINIOP_AVG_POOL1D_CUDA_H__

#include "../avg_pool1d.h"

// Declares op::avg_pool1d::nvidia::Descriptor.
DESCRIPTOR(nvidia)

#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/avg_pool1d.h"
#ifdef ENABLE_CPU_API
#include "cpu/avg_pool1d_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/avg_pool1d_nvidia.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/avg_pool1d_ascend.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/avg_pool1d_bang.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/avg_pool1d_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/avg_pool1d_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/avg_pool1d_moore.h"
#endif
// C API entry point: creates an avg_pool1d descriptor for the device
// identified by `handle->device`, forwarding to the matching backend
// implementation with the given pooling hyper-parameters.
__INFINI_C infiniStatus_t infiniopCreateAvgPool1dDescriptor(
    infiniopHandle_t handle,
    infiniopAvgPool1dDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y,
    infiniopTensorDescriptor_t x,
    size_t kernel_size,
    size_t stride,
    size_t padding) {

// One switch case per backend: cast desc_ptr to the backend descriptor
// type and delegate to its Descriptor::create.
#define CREATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                    \
        return op::avg_pool1d::NAMESPACE::Descriptor::create(                     \
            handle,                                                               \
            reinterpret_cast<op::avg_pool1d::NAMESPACE::Descriptor **>(desc_ptr), \
            y,                                                                    \
            x,                                                                    \
            kernel_size,                                                          \
            stride,                                                               \
            padding)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar/QY/Hygon reuse the nvidia implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_ASCEND_API
        CREATE(INFINI_DEVICE_ASCEND, ascend);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Reports the workspace size (in bytes) required by infiniopAvgPool1d for
// the given descriptor, dispatching on its device type.
__INFINI_C infiniStatus_t infiniopGetAvgPool1dWorkspaceSize(infiniopAvgPool1dDescriptor_t desc,
                                                            size_t *size) {

// One switch case per backend: read workspaceSize() from the typed descriptor.
#define GET(CASE, NAMESPACE)                                                                            \
    case CASE:                                                                                          \
        *size = reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        GET(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        GET(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// C API entry point: runs 1-D average pooling of `x` into `y` on the
// backend the descriptor was created for. `workspace`/`workspace_size`
// must satisfy infiniopGetAvgPool1dWorkspaceSize; `stream` is the backend
// stream handle (ignored by the CPU path).
__INFINI_C infiniStatus_t infiniopAvgPool1d(
    infiniopAvgPool1dDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

// One switch case per backend: forward to the typed descriptor's calculate().
#define CALCULATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                       \
        return reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, x, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// C API entry point: destroys a descriptor previously created with
// infiniopCreateAvgPool1dDescriptor, deleting it through its concrete
// backend type.
__INFINI_C infiniStatus_t
infiniopDestroyAvgPool1dDescriptor(infiniopAvgPool1dDescriptor_t desc) {

// One switch case per backend: delete via the typed descriptor pointer.
#define DELETE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                       \
        delete reinterpret_cast<const op::avg_pool1d::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        DELETE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
        DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_ASCEND_API
        DELETE(INFINI_DEVICE_ASCEND, ascend);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
#include "cross_entropy_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::cross_entropy::cpu {

// CPU implementation of row-wise cross-entropy loss:
//   y[i] = log(sum_j exp(x[i, j])) - x[i, target[i]]
// where the reduction runs over the last (vocab) axis of x and one loss value
// is produced per target entry.

Descriptor::~Descriptor() = default;

// Validate the tensor descriptors and record the sizes the kernel needs.
//
// x      — logits of shape [..., vocab_size], dtype F16/F32/BF16
// target — integer class indices (I32/I64), one per logit row
// y      — per-row loss, same dtype as x, one element per target entry
//
// Returns INFINI_STATUS_BAD_TENSOR_DTYPE / _SHAPE on invalid inputs.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t target_desc) {
    auto x_dtype = x_desc->dtype();
    auto t_dtype = target_desc->dtype();

    CHECK_DTYPE(x_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
    CHECK_DTYPE(t_dtype, INFINI_DTYPE_I32, INFINI_DTYPE_I64);

    // The output must carry the same element type as the logits: calculate()
    // reinterprets y using the dtype recorded from x, so a mismatch would
    // silently write through the wrong type.
    if (y_desc->dtype() != x_dtype) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    // x needs at least one dimension — its last axis is the vocab axis, and
    // shape().back() on a 0-d tensor would be undefined behavior.
    const auto &x_shape = x_desc->shape();
    if (x_shape.empty()) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }

    CrossEntropyInfo info{};
    info.dtype = x_dtype;
    info.target_dtype = t_dtype;
    info.outer_size = target_desc->numel(); // one loss value per target entry
    info.vocab_size = x_shape.back();       // size of the reduced axis
    // NOTE(review): rows are assumed contiguous (stride == vocab_size);
    // confirm callers never pass strided x before relying on this.
    info.x_stride = static_cast<ptrdiff_t>(info.vocab_size);

    // Consistency: x must hold exactly outer_size rows of vocab_size logits,
    // and y must hold one scalar per row.
    if (x_desc->numel() != info.outer_size * info.vocab_size
        || y_desc->numel() != info.outer_size) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }

    *desc_ptr = new Descriptor(nullptr, info, 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Row-wise cross-entropy kernel, parallelized over rows with OpenMP.
//
// The log-sum-exp is evaluated in float with the max-subtraction trick for
// numerical stability regardless of T. Out-of-range target indices
// (including negative ones) produce a loss of 0 — an "ignore" convention
// rather than an error.
//
// T    — logit/loss element type (fp16_t, bf16_t or float)
// Tidx — integer type of the target indices (int32_t or int64_t)
template <typename T, typename Tidx>
infiniStatus_t cross_entropy_kernel(const CrossEntropyInfo *info,
                                    T *y, const T *x, const void *target) {
    const Tidx *label = reinterpret_cast<const Tidx *>(target);

#pragma omp parallel for
    for (ptrdiff_t i = 0; i < ptrdiff_t(info->outer_size); ++i) {
        const T *row = x + i * info->x_stride;
        Tidx idx = label[i];
        // Silently skip rows whose label falls outside [0, vocab_size).
        if (idx < 0 || static_cast<size_t>(idx) >= info->vocab_size) {
            y[i] = utils::cast<T>(0.f);
            continue;
        }

        // Stable log-sum-exp: subtract the row maximum before exponentiating.
        float max_val = op::common_cpu::reduce_op::max(row, info->vocab_size, 1);
        float sum_exp = 0.f;
        for (size_t j = 0; j < info->vocab_size; ++j) {
            sum_exp += std::exp(utils::cast<float>(row[j]) - max_val);
        }
        float log_term = std::log(sum_exp) + max_val;

        float target_logit = utils::cast<float>(row[idx]);
        y[i] = utils::cast<T>(log_term - target_logit);
    }
    return INFINI_STATUS_SUCCESS;
}

// Second-level dispatch: select the kernel instantiation matching the
// target-index dtype recorded at create() time.
template <typename T>
infiniStatus_t dispatch_target_type(const CrossEntropyInfo *info,
                                    T *y, const T *x, const void *target) {
    if (info->target_dtype == INFINI_DTYPE_I32) {
        return cross_entropy_kernel<T, int32_t>(info, y, x, target);
    } else if (info->target_dtype == INFINI_DTYPE_I64) {
        return cross_entropy_kernel<T, int64_t>(info, y, x, target);
    }
    return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

// First-level dispatch on the logit dtype. workspace/workspace_size/stream
// are unused on CPU (create() records a workspace size of 0, and execution
// is synchronous).
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *target,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return dispatch_target_type(&_info, (fp16_t *)y, (const fp16_t *)x, target);
    case INFINI_DTYPE_BF16:
        return dispatch_target_type(&_info, (bf16_t *)y, (const bf16_t *)x, target);
    case INFINI_DTYPE_F32:
        return dispatch_target_type(&_info, (float *)y, (const float *)x, target);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::cross_entropy::cpu
#ifndef __CROSS_ENTROPY_CPU_H__
#define __CROSS_ENTROPY_CPU_H__
#include "../cross_entropy.h"
// Expands to the declaration of op::cross_entropy::cpu::Descriptor; the
// member definitions live in the accompanying CPU source file.
DESCRIPTOR(cpu)
#endif
#ifndef CROSS_ENTROPY_H
#define CROSS_ENTROPY_H
#include "../../operator.h"
#include "info.h"
// Declares op::cross_entropy::<NAMESPACE>::Descriptor for one backend.
// Each backend defines ~Descriptor(), create() and calculate() in its own
// translation unit; the class layout (opaque backend state, the shared
// CrossEntropyInfo, and the required workspace size) is identical across
// backends. The private constructor forces creation through create().
// NOTE: comments are deliberately kept outside the macro body — a `//`
// comment before a line-continuation backslash would splice the next macro
// line into the comment and break the expansion.
#define DESCRIPTOR(NAMESPACE)                                             \
    namespace op::cross_entropy::NAMESPACE {                              \
    class Descriptor final : public InfiniopDescriptor {                  \
        struct Opaque;                                                    \
        Opaque *_opaque;                                                  \
        CrossEntropyInfo _info;                                           \
        size_t _workspace_size;                                           \
                                                                          \
        Descriptor(Opaque *opaque,                                        \
                   CrossEntropyInfo info,                                 \
                   size_t workspace_size,                                 \
                   infiniDevice_t device_type,                            \
                   int device_id)                                         \
            : InfiniopDescriptor{device_type, device_id},                 \
              _opaque(opaque),                                            \
              _info(info),                                                \
              _workspace_size(workspace_size) {}                          \
                                                                          \
    public:                                                               \
        ~Descriptor();                                                    \
        size_t workspaceSize() const { return _workspace_size; }          \
        static infiniStatus_t create(infiniopHandle_t handle,             \
                                     Descriptor **desc_ptr,               \
                                     infiniopTensorDescriptor_t y_desc,   \
                                     infiniopTensorDescriptor_t x_desc,   \
                                     infiniopTensorDescriptor_t target_desc); \
        infiniStatus_t calculate(void *workspace,                         \
                                 size_t workspace_size,                   \
                                 void *y,                                 \
                                 const void *x,                           \
                                 const void *target,                      \
                                 void *stream) const;                     \
    };                                                                    \
    }
#endif // CROSS_ENTROPY_H
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment