Commit cb7f0b7d authored by wooway777

Revert "Merge pull request #1056 from InfiniTensor/issue/1031"

This reverts commit 7f295448, reversing
changes made to e60985dc.
parent 037140c0
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/fmod.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_fmod(py::module &m) {
m.def("fmod",
&op::fmod,
py::arg("a"),
py::arg("b"),
R"doc(Element-wise floating point remainder of division of two tensors.)doc");
m.def("fmod_",
&op::fmod_,
py::arg("c"),
py::arg("a"),
py::arg("b"),
R"doc(In-place element-wise floating point remainder of division of two tensors.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardswish.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardswish(py::module &m) {
m.def("hardswish",
&op::hardswish,
py::arg("input"),
R"doc(Out-of-place Hardswish activation.)doc");
m.def("hardswish_",
&op::hardswish_,
py::arg("output"),
py::arg("input"),
R"doc(In-place Hardswish activation.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardtanh.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardtanh(py::module &m) {
m.def("hardtanh",
&op::hardtanh,
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(Apply the HardTanh activation.)doc");
m.def("hardtanh_",
&op::hardtanh_,
py::arg("output"),
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(In-place HardTanh activation.)doc");
}
} // namespace infinicore::ops
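// --- Illustrative sketch (not part of this commit): the bind_* helpers above are
// meant to be invoked from a single pybind11 module definition, roughly as below.
// The module name "_infinicore_sketch" is an assumption for illustration; the real
// registration point is not shown in this diff, and the binding headers shown above
// are assumed to be included here.
#include <pybind11/pybind11.h>
PYBIND11_MODULE(_infinicore_sketch, m) {
// Each helper registers an operator and its in-place variant on the module.
infinicore::ops::bind_fmod(m);
infinicore::ops::bind_hardswish(m);
infinicore::ops::bind_hardtanh(m);
}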
#ifndef ADAPTIVE_MAX_POOL1D_H
#define ADAPTIVE_MAX_POOL1D_H
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::adaptive_max_pool1d::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AdaptiveMaxPool1dInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
AdaptiveMaxPool1dInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
size_t output_size); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
#endif // ADAPTIVE_MAX_POOL1D_H
#include "adaptive_max_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../reduce/cpu/reduce.h"
#include <algorithm>
#include <cmath>
namespace op::adaptive_max_pool1d::cpu {
Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename T>
infiniStatus_t adaptiveMaxPool1d(const AdaptiveMaxPool1dInfo *info, T *y, const T *x) {
const size_t ndim = info->ndim();
const size_t batch_size = info->shape[0];
const size_t channels = ndim > 2 ? info->shape[1] : 1;
const size_t input_length = info->input_length();
const size_t output_length = info->output_length();
// Total number of task blocks to process (batch_size * channels)
const ptrdiff_t total_blocks = static_cast<ptrdiff_t>(batch_size * channels);
const ptrdiff_t x_stride_last = info->x_strides.back();
#pragma omp parallel for
for (ptrdiff_t block_idx = 0; block_idx < total_blocks; ++block_idx) {
const size_t i = block_idx / channels; // batch index
const size_t j = block_idx % channels; // channel index
const T *x_ptr_base;
T *y_ptr_base;
if (ndim > 2) { // (N, C, L)
x_ptr_base = x + i * info->x_strides[0] + j * info->x_strides[1];
y_ptr_base = y + i * info->y_strides[0] + j * info->y_strides[1];
} else { // (N, L)
x_ptr_base = x + i * info->x_strides[0];
y_ptr_base = y + i * info->y_strides[0];
}
for (size_t out_idx = 0; out_idx < output_length; ++out_idx) {
size_t start_index = (out_idx * input_length) / output_length;
size_t end_index = ((out_idx + 1) * input_length + output_length - 1) / output_length;
start_index = std::max(start_index, size_t(0));
end_index = std::min(end_index, input_length);
size_t window_len = end_index - start_index;
if (window_len == 0) {
continue;
}
const T *window_ptr = x_ptr_base + start_index * x_stride_last;
auto max_val = op::common_cpu::reduce_op::max(window_ptr, window_len, x_stride_last);
y_ptr_base[out_idx] = utils::cast<T>(max_val);
}
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (_info.atype == INFINI_DTYPE_F32) {
return adaptiveMaxPool1d(&_info, (float *)y, (const float *)x);
} else if (_info.atype == INFINI_DTYPE_F16) {
return adaptiveMaxPool1d(&_info, (fp16_t *)y, (const fp16_t *)x);
} else if (_info.atype == INFINI_DTYPE_BF16) {
return adaptiveMaxPool1d(&_info, (bf16_t *)y, (const bf16_t *)x);
} else if (_info.atype == INFINI_DTYPE_F64) {
return adaptiveMaxPool1d(&_info, (double *)y, (const double *)x);
}
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} // namespace op::adaptive_max_pool1d::cpu
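// --- Illustrative sketch (not part of this commit): the adaptive window arithmetic
// used in adaptiveMaxPool1d above, isolated as a standalone program. Each output
// index i covers [floor(i * L_in / L_out), ceil((i + 1) * L_in / L_out)); for
// L_in = 10 and L_out = 4 this prints the windows [0,3) [2,5) [5,8) [7,10), so the
// whole input is covered and adjacent windows may overlap.
#include <cstddef>
#include <cstdio>
int main() {
const size_t input_length = 10;
const size_t output_length = 4;
for (size_t out_idx = 0; out_idx < output_length; ++out_idx) {
// Same integer arithmetic as the kernel: floor for the start, ceiling for the end.
size_t start_index = (out_idx * input_length) / output_length;
size_t end_index = ((out_idx + 1) * input_length + output_length - 1) / output_length;
std::printf("window %zu: [%zu, %zu)\n", out_idx, start_index, end_index);
}
return 0;
}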
#ifndef __ADAPTIVE_MAX_POOL1D_CPU_H__
#define __ADAPTIVE_MAX_POOL1D_CPU_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(cpu)
#endif
#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#define __ADAPTIVE_MAX_POOL1D_CUDA_KERNEL_H__
#include <cmath>
#include <limits>
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
__device__ void adaptiveMaxPool1dBlock(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
size_t block_idx = blockIdx.x;
size_t batch_idx = block_idx / channels;
size_t channel_idx = block_idx % channels;
const Tdata *x_ptr;
Tdata *y_ptr;
if (ndim > 2) {
x_ptr = x + batch_idx * stride_x_batch + channel_idx * stride_x_channel;
y_ptr = y + batch_idx * stride_y_batch + channel_idx * stride_y_channel;
} else {
x_ptr = x + batch_idx * stride_x_batch;
y_ptr = y + batch_idx * stride_y_batch;
}
for (size_t out_idx = threadIdx.x; out_idx < output_length; out_idx += BLOCK_SIZE) {
int start_index = static_cast<int>(floorf((float)out_idx * input_length / output_length));
int end_index = static_cast<int>(ceilf((float)(out_idx + 1) * input_length / output_length));
if (end_index <= start_index) {
continue;
}
Tcompute max_val = Tcompute(x_ptr[start_index * stride_x_length]);
for (int i = start_index + 1; i < end_index; ++i) {
Tcompute val = Tcompute(x_ptr[i * stride_x_length]);
max_val = max(max_val, val);
}
y_ptr[out_idx] = Tdata(max_val);
}
}
#endif
#ifndef __ADAPTIVE_MAX_POOL1D_H__
#define __ADAPTIVE_MAX_POOL1D_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <vector>
namespace op::adaptive_max_pool1d {
class AdaptiveMaxPool1dInfo {
AdaptiveMaxPool1dInfo() = default;
public:
infiniDtype_t atype;
std::vector<size_t> shape;
std::vector<ptrdiff_t> y_strides;
std::vector<ptrdiff_t> x_strides;
size_t input_size;
size_t output_size;
size_t ndim() const { return shape.size(); }
size_t input_length() const { return input_size; }
size_t output_length() const { return output_size; }
static utils::Result<AdaptiveMaxPool1dInfo> create(
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto atype = y_desc->dtype();
if (x_desc->dtype() != atype) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
if (atype != INFINI_DTYPE_F16 && atype != INFINI_DTYPE_BF16 && atype != INFINI_DTYPE_F32 && atype != INFINI_DTYPE_F64) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
const size_t y_ndim = y_desc->ndim();
const size_t x_ndim = x_desc->ndim();
if (y_ndim != x_ndim) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
for (size_t i = 0; i < y_ndim - 1; ++i) {
if (x_desc->dim(i) != y_desc->dim(i)) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
}
if (y_desc->dim(y_ndim - 1) != output_size) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
return utils::Result<AdaptiveMaxPool1dInfo>(AdaptiveMaxPool1dInfo{
atype,
y_desc->shape(),
y_desc->strides(),
x_desc->strides(),
x_desc->dim(x_ndim - 1),
output_size});
}
};
} // namespace op::adaptive_max_pool1d
#endif // __ADAPTIVE_MAX_POOL1D_H__
#ifndef __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#define __ADAPTIVE_MAX_POOL1D_METAX_CUH__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(metax)
#endif
#include "../../../devices/metax/metax_common.h"
#include "adaptive_max_pool1d_metax.cuh"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_METAX_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t num_blocks,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
hcStream_t stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__hpcc_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream_) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto stream = reinterpret_cast<hcStream_t>(stream_);
if (_opaque->internal->maxThreadsPerBlock() >= METAX_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::metax
#ifndef __ADAPTIVE_MAX_POOL1D_MOORE_H__
#define __ADAPTIVE_MAX_POOL1D_MOORE_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(moore)
#endif
#include "../../../devices/moore/moore_common.h"
#include "adaptive_max_pool1d_moore.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MOORE_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t num_blocks,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
musaStream_t musa_stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, musa_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__mt_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_2048) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_2048>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= MOORE_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<MOORE_BLOCK_SIZE_512>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
musa_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "adaptive_max_pool1d_nvidia.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_CUDA_KERNEL adaptiveMaxPool1dKernel(
Tdata *__restrict__ y,
ptrdiff_t stride_y_batch,
ptrdiff_t stride_y_channel,
const Tdata *__restrict__ x,
ptrdiff_t stride_x_batch,
ptrdiff_t stride_x_channel,
ptrdiff_t stride_x_length,
size_t channels,
size_t input_length,
size_t output_length,
size_t ndim) {
adaptiveMaxPool1dBlock<BLOCK_SIZE, Tdata, Tcompute>(
y, stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim);
}
namespace op::adaptive_max_pool1d::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
auto result = AdaptiveMaxPool1dInfo::create(y_desc, x_desc, output_size);
CHECK_RESULT(result);
auto info = result.take();
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
std::move(info),
0,
handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
uint32_t num_blocks,
void *y, infiniDtype_t dtype,
ptrdiff_t stride_y_batch, ptrdiff_t stride_y_channel,
const void *x,
ptrdiff_t stride_x_batch, ptrdiff_t stride_x_channel, ptrdiff_t stride_x_length,
size_t channels, size_t input_length, size_t output_length, size_t ndim,
cudaStream_t cuda_stream) {
#define LAUNCH_KERNEL(Tdata, Tcompute) \
adaptiveMaxPool1dKernel<BLOCK_SIZE, Tdata, Tcompute><<<num_blocks, BLOCK_SIZE, 0, cuda_stream>>>( \
reinterpret_cast<Tdata *>(y), \
stride_y_batch, stride_y_channel, \
reinterpret_cast<const Tdata *>(x), \
stride_x_batch, stride_x_channel, stride_x_length, \
channels, input_length, output_length, ndim)
if (dtype == INFINI_DTYPE_F16) {
LAUNCH_KERNEL(half, float);
} else if (dtype == INFINI_DTYPE_BF16) {
LAUNCH_KERNEL(__nv_bfloat16, float);
} else if (dtype == INFINI_DTYPE_F32) {
LAUNCH_KERNEL(float, float);
} else if (dtype == INFINI_DTYPE_F64) {
LAUNCH_KERNEL(double, double);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
#undef LAUNCH_KERNEL
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace, size_t workspace_size,
void *y, const void *x,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
const size_t ndim = _info.ndim();
const size_t batch_size = _info.shape[0];
const size_t channels = ndim > 2 ? _info.shape[1] : 1;
const size_t input_length = _info.input_length();
const size_t output_length = _info.output_length();
ptrdiff_t stride_x_batch = _info.x_strides[0];
ptrdiff_t stride_x_channel = ndim > 2 ? _info.x_strides[1] : 0;
ptrdiff_t stride_x_length = _info.x_strides.back();
ptrdiff_t stride_y_batch = _info.y_strides[0];
ptrdiff_t stride_y_channel = ndim > 2 ? _info.y_strides[1] : 0;
uint32_t num_blocks = static_cast<uint32_t>(batch_size * channels);
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_4096) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_1024) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
cuda_stream));
} else if (_opaque->internal->maxThreadsPerBlock() >= CUDA_BLOCK_SIZE_512) {
CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
num_blocks, y, _info.atype,
stride_y_batch, stride_y_channel,
x, stride_x_batch, stride_x_channel, stride_x_length,
channels, input_length, output_length, ndim,
cuda_stream));
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::adaptive_max_pool1d::nvidia
#ifndef __ADAPTIVE_MAX_POOL1D_CUDA_H__
#define __ADAPTIVE_MAX_POOL1D_CUDA_H__
#include "../adaptive_max_pool1d.h"
DESCRIPTOR(nvidia)
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/adaptive_max_pool1d.h"
#ifdef ENABLE_CPU_API
#include "cpu/adaptive_max_pool1d_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/adaptive_max_pool1d_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/adaptive_max_pool1d_metax.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/adaptive_max_pool1d_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateAdaptiveMaxPool1dDescriptor(
infiniopHandle_t handle,
infiniopAdaptiveMaxPool1dDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::adaptive_max_pool1d::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, \
x_desc, \
output_size)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef CREATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopGetAdaptiveMaxPool1dWorkspaceSize(
infiniopAdaptiveMaxPool1dDescriptor_t desc,
size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopAdaptiveMaxPool1d(
infiniopAdaptiveMaxPool1dDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor *>(desc)->calculate( \
workspace, workspace_size, y, x, stream);
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef CALCULATE
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopDestroyAdaptiveMaxPool1dDescriptor(
infiniopAdaptiveMaxPool1dDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::adaptive_max_pool1d::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
}
#undef DESTROY
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
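// --- Illustrative sketch (not part of this commit): a typical call sequence for the
// adaptive_max_pool1d C API defined above. The handle, tensor descriptors, device
// buffers, and workspace are assumed to have been created elsewhere; only the four
// entry points from this file are used.
static infiniStatus_t runAdaptiveMaxPool1dOnce(
infiniopHandle_t handle,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t output_size,
void *y, const void *x,
void *workspace, size_t workspace_capacity,
void *stream) {
infiniopAdaptiveMaxPool1dDescriptor_t desc = nullptr;
infiniStatus_t status = infiniopCreateAdaptiveMaxPool1dDescriptor(
handle, &desc, y_desc, x_desc, output_size);
if (status != INFINI_STATUS_SUCCESS) {
return status;
}
// Query how much workspace the chosen backend needs before launching.
size_t workspace_size = 0;
status = infiniopGetAdaptiveMaxPool1dWorkspaceSize(desc, &workspace_size);
if (status == INFINI_STATUS_SUCCESS && workspace_size <= workspace_capacity) {
status = infiniopAdaptiveMaxPool1d(desc, workspace, workspace_size, y, x, stream);
}
// The descriptor is independent of the buffers and must be destroyed explicitly.
infiniopDestroyAdaptiveMaxPool1dDescriptor(desc);
return status;
}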
#include "asinh_cpu.h"
namespace op::asinh::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<AsinhOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<AsinhOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<AsinhOp, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<AsinhOp, bf16_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::asinh::cpu
#ifndef __ASINH_CPU_H__
#define __ASINH_CPU_H__
#include <cmath>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(asinh, cpu)
namespace op::asinh::cpu {
typedef struct AsinhOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
T operator()(const T &x) const {
return std::asinh(x);
}
} AsinhOp;
} // namespace op::asinh::cpu
#endif // __ASINH_CPU_H__
#ifndef __ASINH_CUDA_KERNEL_H__
#define __ASINH_CUDA_KERNEL_H__
namespace op::asinh::cuda {
typedef struct AsinhOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
__device__ __forceinline__ T operator()(const T &x) const {
if constexpr (std::is_same_v<T, half>) {
float x_f = __half2float(x);
return __float2half(asinhf(x_f));
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
float x_f = __bfloat162float(x);
return __float2bfloat16(asinhf(x_f));
} else if constexpr (std::is_same_v<T, float>) {
return asinhf(x);
} else {
return ::asinh(x);
}
}
} AsinhOp;
} // namespace op::asinh::cuda
#endif // __ASINH_CUDA_KERNEL_H__
#include "asinh_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::asinh::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &x_desc = input_desc_vec.at(0);
const auto &y_shape = out_desc->shape();
const auto &x_shape = x_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(y_shape, x_shape);
// create Metax elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AsinhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::AsinhOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::asinh::metax
#ifndef __ASINH_METAX_API_H__
#define __ASINH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(asinh, metax)
#endif // __ASINH_METAX_API_H__