"nndet/csrc/ops.cpp" did not exist on "aeb830308e135c27766931e19c846021644838c1"
Unverified Commit d6af9c90 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

issue/1031 T1-1-17

parent af0a1475
#ifndef __HARDSWISH_MOORE_KERNEL_H__
#define __HARDSWISH_MOORE_KERNEL_H__
#include <cmath>
#include <type_traits>
namespace op::hardswish::moore {
// HardSwish elementwise functor: y = x * relu6(x + 3) / 6.
// Low-precision inputs (half / bf16) are promoted to float for the
// arithmetic and rounded back to the storage type on return.
typedef struct HardSwishOp {
public:
    // Number of input tensors consumed by this elementwise op.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, half>) {
            const float xf = __half2float(x);
            return __float2half(xf * fminf(fmaxf(xf + 3.0f, 0.0f), 6.0f) * 0.16666667f);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            const float xf = __bfloat162float(x);
            return __float2bfloat16_rn(xf * fminf(fmaxf(xf + 3.0f, 0.0f), 6.0f) * 0.16666667f);
        } else if constexpr (std::is_same_v<T, float>) {
            return x * fminf(fmaxf(x + 3.0f, 0.0f), 6.0f) * 0.16666667f;
        } else if constexpr (std::is_same_v<T, double>) {
            return x * fmin(fmax(x + 3.0, 0.0), 6.0) * (1.0 / 6.0);
        } else {
            // Generic fallback: route through float like the half/bf16 paths.
            const float xf = static_cast<float>(x);
            return static_cast<T>(xf * fminf(fmaxf(xf + 3.0f, 0.0f), 6.0f) * 0.16666667f);
        }
    }
} HardSwishOp;
} // namespace op::hardswish::moore
#endif // __HARDSWISH_MOORE_KERNEL_H__
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "hardswish_nvidia.cuh"
#include <cuda_runtime.h>
namespace op::hardswish::nvidia {
namespace {
// True when the single input and the output are both plain contiguous
// buffers with no broadcasting, so a flat 1-D kernel can replace the
// generic strided elementwise path.
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
    if (!info.isOutputContiguous() || info.getInputSize() != 1) {
        return false;
    }
    return info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}
// Flat grid-stride HardSwish kernel for contiguous tensors.
// Grid-stride form means any launch configuration is correct; index and
// stride are widened to size_t BEFORE multiplying (the original computed
// blockIdx.x * blockDim.x and blockDim.x * gridDim.x in 32-bit unsigned
// arithmetic, which overflows for sufficiently large launches).
template <typename T>
__global__ void hardswish_contiguous_kernel(size_t numel, T *__restrict__ out, const T *__restrict__ in) {
    const auto op = op::hardswish::cuda::HardSwishOp{};
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < numel;
         idx += stride) {
        out[idx] = op(in[idx]);
    }
}
// Host-side launcher for the contiguous fast path.
// The block count is computed and clamped in size_t before narrowing to
// int: the original narrowed the ceil-div result to int first, which could
// overflow for very large numel before the 65535 clamp was applied. The
// clamp itself is safe because the kernel is grid-stride.
template <typename T>
infiniStatus_t launch_fast_path(size_t numel,
                                void *output,
                                const std::vector<const void *> &inputs,
                                void *stream) {
    if (numel == 0) {
        return INFINI_STATUS_SUCCESS; // empty tensor: nothing to do
    }
    constexpr int BLOCK_SIZE = 256;
    constexpr size_t MAX_GRID = 65535;
    const size_t blocks = (numel + BLOCK_SIZE - 1) / BLOCK_SIZE; // ceil-div
    const int grid = static_cast<int>(std::min(blocks, MAX_GRID));
    auto *out_ptr = reinterpret_cast<T *>(output);
    auto *in_ptr = reinterpret_cast<const T *>(inputs[0]);
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    hardswish_contiguous_kernel<<<grid, BLOCK_SIZE, 0, cuda_stream>>>(numel, out_ptr, in_ptr);
    // Kernel launches do not return a status; check for launch-config errors.
    cudaError_t err = cudaGetLastError();
    return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace
// Out-of-line default destructor. NOTE(review): presumably required because
// the descriptor owns a pointer to a type that is incomplete where the class
// is declared — confirm against hardswish_nvidia.cuh.
Descriptor::~Descriptor() = default;
// Builds an NVIDIA hardswish descriptor after validating dtype and shapes.
// NOTE(review): CREATE_ELEMENTWISE_CUDA_DESCRIPTOR likely expands in terms of
// the local names `handle`, `dtype`, `out_desc` and `input_desc_vec` — verify
// its definition before renaming any of them.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // Unary op: exactly one input descriptor is expected.
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    // Floating-point dtypes only; output shape must match the input.
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    // Allocates *desc_ptr and sets up elementwise metadata / workspace size.
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Runs hardswish on the device. Contiguous, non-broadcast tensors take a
// cheap flat kernel that needs no workspace; everything else goes through
// the generic strided elementwise path, which requires _workspace_size
// bytes of workspace. (The unreachable trailing `return INFINI_STATUS_SUCCESS;`
// after the exhaustive switch was removed.)
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    const bool fast_path = can_use_contiguous_fast_path(_info);
    if (fast_path) {
        switch (_dtype) {
        case INFINI_DTYPE_BF16:
            return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
        case INFINI_DTYPE_F16:
            return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
        case INFINI_DTYPE_F32:
            return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
        case INFINI_DTYPE_F64:
            return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
        default:
            break; // unexpected dtype: fall through and let the generic path report it
        }
    }
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::hardswish::nvidia
#ifndef __HARDSWISH_CUDA_API_H__
#define __HARDSWISH_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(hardswish, nvidia)
#endif
\ No newline at end of file
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/hardswish.h"
#ifdef ENABLE_CPU_API
#include "cpu/hardswish_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/hardswish_nvidia.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/hardswish_moore.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/hardswish_metax.h"
#endif
// C API: creates a HardSwish descriptor for the device that `handle`
// targets, dispatching to the backend-specific Descriptor::create.
__INFINI_C infiniStatus_t infiniopCreateHardSwishDescriptor(
    infiniopHandle_t handle,
    infiniopHardSwishDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc) {
// Expands to one switch case that forwards to the backend's create().
#define CREATE(CASE, NAMESPACE)                                                  \
    case CASE:                                                                   \
        return op::hardswish::NAMESPACE::Descriptor::create(                     \
            handle,                                                              \
            reinterpret_cast<op::hardswish::NAMESPACE::Descriptor **>(desc_ptr), \
            output_desc,                                                         \
            {input_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the NVIDIA (CUDA) implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// C API: reports the workspace size required by infiniopHardSwish for this
// descriptor, dispatching on the device type it was created for.
// (The duplicate `return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;` after the
// switch was unreachable — every switch path returns — and was removed for
// consistency with the sibling entry points in this file.)
__INFINI_C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) {
// Expands to one switch case that queries the backend descriptor.
#define GET(CASE, NAMESPACE)                                                                      \
    case CASE:                                                                                    \
        *size = reinterpret_cast<op::hardswish::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the NVIDIA (CUDA) implementation.
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET
}
// C API: computes output = hardswish(input) using the backend selected when
// the descriptor was created. `workspace` must be at least the size reported
// by infiniopGetHardSwishWorkspaceSize.
__INFINI_C infiniStatus_t infiniopHardSwish(
    infiniopHardSwishDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    void *stream) {
// Expands to one switch case that forwards to the backend's calculate().
#define CALCULATE(CASE, NAMESPACE)                                                  \
    case CASE:                                                                      \
        return reinterpret_cast<const op::hardswish::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, output, {input}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the NVIDIA (CUDA) implementation.
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// C API: destroys a HardSwish descriptor created by
// infiniopCreateHardSwishDescriptor, dispatching on its device type.
__INFINI_C infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) {
// Expands to one switch case that deletes the backend descriptor.
#define DELETE(CASE, NAMESPACE)                                                      \
    case CASE:                                                                       \
        delete reinterpret_cast<const op::hardswish::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the NVIDIA (CUDA) implementation.
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
#include "hardtanh_cpu.h"
#include <type_traits>
namespace op::hardtanh::cpu {
// Stores dtype, elementwise layout metadata and the hardtanh clamp bounds.
// workspace_size is 0 on the CPU path (see create()).
Descriptor::Descriptor(infiniDtype_t dtype,
                       op::elementwise::ElementwiseInfo info,
                       size_t workspace_size,
                       infiniDevice_t device_type,
                       int device_id,
                       float min_val,
                       float max_val)
    : InfiniopDescriptor{device_type, device_id},
      _dtype(dtype),
      _info(std::move(info)),
      _workspace_size(workspace_size),
      _min_val(min_val),
      _max_val(max_val) {}

Descriptor::~Descriptor() = default;
// Validates dtype/shape, builds the elementwise metadata and allocates the
// CPU hardtanh descriptor. min_val/max_val are the clamp bounds
// (no min_val <= max_val check is performed here).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
    float min_val,
    float max_val) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    // Unary op: exactly one input descriptor is expected.
    const auto &input_desc = input_desc_vec.at(0);
    const auto &output_shape = out_desc->shape();
    const auto &input_shape = input_desc->shape();
    // Floating-point dtypes only; output shape must match the input.
    CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(output_shape, input_shape);
    auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
    CHECK_RESULT(info_result);
    *desc_ptr = new Descriptor(
        dtype,
        info_result.take(),
        0, // CPU path needs no workspace
        handle->device,
        handle->device_id,
        min_val,
        max_val);
    return INFINI_STATUS_SUCCESS;
}
// CPU hardtanh over (possibly strided) tensors described by `info`.
// Each output element is the input clamped to [min_val, max_val]; half/bf16
// values are converted through float for the comparison. Parallelized with
// OpenMP for tensors larger than 1024 elements.
// Improvement over the original: the loop-invariant contiguity queries and
// the functor construction are hoisted out of the per-element loop instead
// of being re-evaluated on every iteration.
template <typename T>
static infiniStatus_t launchCpuHardTanh(const op::elementwise::ElementwiseInfo &info,
                                        void *output,
                                        const std::vector<const void *> &inputs,
                                        float min_val,
                                        float max_val) {
    if (inputs.empty()) {
        return INFINI_STATUS_BAD_PARAM;
    }
    T *out = reinterpret_cast<T *>(output);
    const T *in = reinterpret_cast<const T *>(inputs[0]);
    const auto ndim = info.getNdim();
    const auto *output_shape = info.getOutputShape();
    const auto *output_strides = info.getOutputStrides();
    const auto *input_shape = info.getInputShape(0);
    const auto *input_strides = info.getInputStrides(0);
    const auto *input_contiguous = info.getInputContiguous();
    // Loop invariants, hoisted out of the (possibly parallel) loop.
    const bool out_contiguous = info.isOutputContiguous();
    const bool in_contiguous = input_contiguous[0];
    const HardTanhOp op{};
    ptrdiff_t output_size = info.getOutputSize();
#pragma omp parallel for if (output_size > 1024)
    for (ptrdiff_t i = 0; i < output_size; ++i) {
        // Contiguous tensors index directly; strided ones translate the
        // linear index into a memory offset.
        const size_t out_idx = out_contiguous
                                   ? static_cast<size_t>(i)
                                   : op::common_cpu::indexToOffset(i, ndim, output_shape, output_strides);
        const size_t in_idx = in_contiguous
                                  ? static_cast<size_t>(i)
                                  : op::common_cpu::indexToOffset(i, ndim, input_shape, input_strides);
        if constexpr (std::is_same_v<T, fp16_t> || std::is_same_v<T, bf16_t>) {
            // Clamp in float precision, then round back to the storage type.
            float value = utils::cast<float>(in[in_idx]);
            out[out_idx] = utils::cast<T>(op(value, min_val, max_val));
        } else {
            out[out_idx] = op(in[in_idx], min_val, max_val);
        }
    }
    return INFINI_STATUS_SUCCESS;
}
// Dispatches hardtanh to the typed CPU kernel according to the descriptor's
// dtype. workspace and stream are unused on the CPU path.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    (void)workspace;
    (void)workspace_size;
    (void)stream;
    // Unary op: exactly one input buffer is expected.
    if (inputs.size() != 1) {
        return INFINI_STATUS_BAD_PARAM;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BF16:
        return launchCpuHardTanh<bf16_t>(_info, output, inputs, _min_val, _max_val);
    case INFINI_DTYPE_F16:
        return launchCpuHardTanh<fp16_t>(_info, output, inputs, _min_val, _max_val);
    case INFINI_DTYPE_F32:
        return launchCpuHardTanh<float>(_info, output, inputs, _min_val, _max_val);
    case INFINI_DTYPE_F64:
        return launchCpuHardTanh<double>(_info, output, inputs, _min_val, _max_val);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::hardtanh::cpu
#ifndef __HARDTANH_CPU_H__
#define __HARDTANH_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <algorithm>
namespace op::hardtanh::cpu {
// CPU hardtanh descriptor: holds the tensor dtype, elementwise layout
// metadata and the clamp bounds. Constructed via create().
class Descriptor final : public InfiniopDescriptor {
    infiniDtype_t _dtype;                   // element dtype of input/output
    op::elementwise::ElementwiseInfo _info; // shapes/strides/contiguity metadata
    size_t _workspace_size;                 // 0 on CPU (set by create())
    float _min_val;                         // lower clamp bound
    float _max_val;                         // upper clamp bound

    Descriptor(infiniDtype_t dtype,
               op::elementwise::ElementwiseInfo info,
               size_t workspace_size,
               infiniDevice_t device_type,
               int device_id,
               float min_val,
               float max_val);

public:
    ~Descriptor();

    // Workspace bytes the caller must provide to calculate().
    size_t workspaceSize() const { return _workspace_size; }

    // Validates descriptors and allocates *desc_ptr.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec,
        float min_val,
        float max_val);

    // Computes output = clamp(input, min_val, max_val) element-wise.
    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;

    float minVal() const { return _min_val; }
    float maxVal() const { return _max_val; }
};
// Clamp functor for hardtanh: returns x limited to [min_val, max_val]
// (the bounds are converted to T before comparing).
typedef struct HardTanhOp {
public:
    // Number of input tensors consumed by this elementwise op.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    T operator()(const T &x, float min_val, float max_val) const {
        const T low = static_cast<T>(min_val);
        const T high = static_cast<T>(max_val);
        // max-then-min preserves the original comparison order, including
        // the min_val > max_val case (the upper bound wins).
        return std::min(std::max(x, low), high);
    }
} HardTanhOp;
} // namespace op::hardtanh::cpu
#endif
#ifndef __HARDTANH_CUDA_H__
#define __HARDTANH_CUDA_H__
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
#include <type_traits>
namespace op::hardtanh::cuda {
// CUDA hardtanh functor: clamps x to [min_val, max_val].
// half2 is clamped lane-wise; half/bf16 go through float. Every branch keeps
// the fminf(max, fmaxf(min, x)) nesting so results are uniform across types.
// Fix: the original if-constexpr chain had no final else, so instantiating
// the operator with an unhandled type fell off the end of a value-returning
// function (undefined behavior). A generic fallback matching the Moore
// kernel's was added.
typedef struct HardTanhOp {
public:
    // Number of input tensors consumed by this elementwise op.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const {
        if constexpr (std::is_same_v<T, half2>) {
            // Clamp both packed lanes in float precision.
            float2 x_f2 = __half22float2(x);
            x_f2.x = fminf(max_val, fmaxf(min_val, x_f2.x));
            x_f2.y = fminf(max_val, fmaxf(min_val, x_f2.y));
            return __float22half2_rn(x_f2);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            float x_f = __bfloat162float(x);
            return __float2bfloat16(fminf(max_val, fmaxf(min_val, x_f)));
        } else if constexpr (std::is_same_v<T, half>) {
            float x_f = __half2float(x);
            return __float2half(fminf(max_val, fmaxf(min_val, x_f)));
        } else if constexpr (std::is_same_v<T, float>) {
            return fminf(max_val, fmaxf(min_val, x));
        } else if constexpr (std::is_same_v<T, double>) {
            return fmin((double)max_val, fmax((double)min_val, x));
        } else {
            // Generic fallback: clamp in float precision.
            float x_f = static_cast<float>(x);
            return static_cast<T>(fminf(max_val, fmaxf(min_val, x_f)));
        }
    }
} HardTanhOp;
} // namespace op::hardtanh::cuda
#endif
#ifndef __HARDTANH_METAX_API_H__
#define __HARDTANH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
namespace op::hardtanh::metax {
// Metax hardtanh descriptor: dtype, elementwise layout metadata and clamp
// bounds, plus the owned device implementation used to launch kernels.
class Descriptor final : public InfiniopDescriptor {
    infiniDtype_t _dtype;                   // element dtype of input/output
    op::elementwise::ElementwiseInfo _info; // shapes/strides/contiguity metadata
    // Backend kernel launcher, owned by the descriptor.
    std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
    size_t _workspace_size; // workspace bytes required by calculate()
    float _min_val;         // lower clamp bound
    float _max_val;         // upper clamp bound

    Descriptor(infiniDtype_t dtype,
               op::elementwise::ElementwiseInfo info,
               op::elementwise::metax::DeviceImpl *device_info,
               size_t workspace_size,
               infiniDevice_t device_type,
               int device_id,
               float min_val,
               float max_val);

public:
    ~Descriptor();

    // Workspace bytes the caller must provide to calculate().
    size_t workspaceSize() const { return _workspace_size; }

    // Validates descriptors and allocates *desc_ptr.
    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec,
        float min_val,
        float max_val);

    // Computes output = clamp(input, min_val, max_val) element-wise.
    infiniStatus_t calculate(
        void *workspace,
        size_t workspace_size,
        void *output,
        std::vector<const void *> inputs,
        void *stream) const;
};
} // namespace op::hardtanh::metax
#endif // __HARDTANH_METAX_API_H__
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#ifndef __HARDTANH_MOORE_KERNEL_H__
#define __HARDTANH_MOORE_KERNEL_H__
#include <cmath>
#include <type_traits>
namespace op::hardtanh::moore {
// Moore hardtanh functor: clamps x to [min_val, max_val].
// NOTE: keep the fminf(max_val, fmaxf(min_val, x)) nesting — per IEEE
// fmin/fmax semantics, reordering the clamp changes the result for NaN
// inputs and for min_val > max_val.
typedef struct HardTanhOp {
public:
    // Number of input tensors consumed by this elementwise op.
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const {
        if constexpr (std::is_same_v<T, half>) {
            // Promote to float, clamp, round back to half.
            float x_f = __half2float(x);
            return __float2half(fminf(max_val, fmaxf(min_val, x_f)));
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            float x_f = __bfloat162float(x);
            return __float2bfloat16_rn(fminf(max_val, fmaxf(min_val, x_f)));
        } else if constexpr (std::is_same_v<T, float>) {
            return fminf(max_val, fmaxf(min_val, x));
        } else if constexpr (std::is_same_v<T, double>) {
            return fmin((double)max_val, fmax((double)min_val, x));
        } else {
            // Generic fallback: clamp in float precision.
            float x_f = static_cast<float>(x);
            return static_cast<T>(fminf(max_val, fmaxf(min_val, x_f)));
        }
    }
} HardTanhOp;
} // namespace op::hardtanh::moore
#endif // __HARDTANH_MOORE_KERNEL_H__
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment