Commit def22a08 authored by wooway777

Revert "Merge pull request #1056 from InfiniTensor/issue/1031"

This reverts commit 7f295448, reversing
changes made to e60985dc.
parent 1795b38a
#ifndef __HARDSWISH_MOORE_KERNEL_H__
#define __HARDSWISH_MOORE_KERNEL_H__
#include <cmath>
#include <type_traits>
namespace op::hardswish::moore {
typedef struct HardSwishOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
__device__ __forceinline__ T operator()(const T &x) const {
if constexpr (std::is_same_v<T, half>) {
float x_f = __half2float(x);
float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
return __float2half(x_f * val * 0.16666667f);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
float x_f = __bfloat162float(x);
float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
return __float2bfloat16_rn(x_f * val * 0.16666667f);
} else if constexpr (std::is_same_v<T, float>) {
float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
return x * val * 0.16666667f;
} else if constexpr (std::is_same_v<T, double>) {
double val = fmin(fmax(x + 3.0, 0.0), 6.0);
return x * val * (1.0 / 6.0);
} else {
float x_f = static_cast<float>(x);
float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
return static_cast<T>(x_f * val * 0.16666667f);
}
}
} HardSwishOp;
} // namespace op::hardswish::moore
#endif // __HARDSWISH_MOORE_KERNEL_H__
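All branches above compute the same function, hardswish(x) = x · min(max(x + 3, 0), 6) / 6; the constant 0.16666667f is 1/6. A host-side reference for cross-checking the device kernels (an editor's sketch, not part of the reverted sources):

```cpp
#include <algorithm>

// Reference implementation: hardswish(x) = x * clamp(x + 3, 0, 6) / 6.
inline float hardswish_ref(float x) {
    float val = std::min(std::max(x + 3.0f, 0.0f), 6.0f);
    return x * val / 6.0f;
}
```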
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "hardswish_nvidia.cuh"
#include <cuda_runtime.h>
namespace op::hardswish::nvidia {
namespace {
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}
template <typename T>
__global__ void hardswish_contiguous_kernel(size_t numel, T *out, const T *in) {
const auto op = op::hardswish::cuda::HardSwishOp{};
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
while (idx < numel) {
out[idx] = op(in[idx]);
idx += blockDim.x * gridDim.x;
}
}
template <typename T>
infiniStatus_t launch_fast_path(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int BLOCK_SIZE = 256;
int grid = static_cast<int>((numel + BLOCK_SIZE - 1) / BLOCK_SIZE);
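// Cap the grid at 65535 blocks; the kernel's grid-stride loop covers the rest.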
grid = std::min(grid, 65535);
auto *out_ptr = reinterpret_cast<T *>(output);
auto *in_ptr = reinterpret_cast<const T *>(inputs[0]);
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
hardswish_contiguous_kernel<<<grid, BLOCK_SIZE, 0, cuda_stream>>>(numel, out_ptr, in_ptr);
cudaError_t err = cudaGetLastError();
return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
const bool fast_path = can_use_contiguous_fast_path(_info);
if (fast_path) {
switch (_dtype) {
case INFINI_DTYPE_BF16:
return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
case INFINI_DTYPE_F16:
return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
case INFINI_DTYPE_F32:
return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
case INFINI_DTYPE_F64:
return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
default:
break;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::hardswish::nvidia
#ifndef __HARDSWISH_CUDA_API_H__
#define __HARDSWISH_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(hardswish, nvidia)
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/hardswish.h"
#ifdef ENABLE_CPU_API
#include "cpu/hardswish_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/hardswish_nvidia.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/hardswish_moore.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/hardswish_metax.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateHardSwishDescriptor(
infiniopHandle_t handle,
infiniopHardSwishDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::hardswish::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::hardswish::NAMESPACE::Descriptor **>(desc_ptr), \
output_desc, \
{input_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetHardSwishWorkspaceSize(infiniopHardSwishDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::hardswish::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopHardSwish(
infiniopHardSwishDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::hardswish::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, output, {input}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t infiniopDestroyHardSwishDescriptor(infiniopHardSwishDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::hardswish::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
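Taken together, these four entry points form the standard descriptor lifecycle: create, query workspace, run, destroy. Below is a minimal usage sketch, not taken from the repository: the `CHECK` macro is hypothetical, and `handle`, `in_desc`, `out_desc`, the device buffers `d_in`/`d_out`, and `stream` are assumed to have been created through the corresponding infiniop/CUDA setup calls.

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical convenience macro: abort on any non-success status.
#define CHECK(call)                                         \
    do {                                                    \
        if ((call) != INFINI_STATUS_SUCCESS) {              \
            std::fprintf(stderr, "infiniop call failed\n"); \
            std::exit(EXIT_FAILURE);                        \
        }                                                   \
    } while (0)

void run_hardswish(infiniopHandle_t handle,
                   infiniopTensorDescriptor_t out_desc,
                   infiniopTensorDescriptor_t in_desc,
                   void *d_out, const void *d_in, void *stream) {
    infiniopHardSwishDescriptor_t desc = nullptr;
    CHECK(infiniopCreateHardSwishDescriptor(handle, &desc, out_desc, in_desc));

    size_t ws_size = 0;
    CHECK(infiniopGetHardSwishWorkspaceSize(desc, &ws_size));

    void *workspace = nullptr;
    if (ws_size > 0) {
        cudaMalloc(&workspace, ws_size); // scratch for the strided path only
    }

    // The contiguous fast path ignores the workspace entirely.
    CHECK(infiniopHardSwish(desc, workspace, ws_size, d_out, d_in, stream));

    cudaFree(workspace);
    CHECK(infiniopDestroyHardSwishDescriptor(desc));
}
```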
#include "hardtanh_cpu.h"
#include <type_traits>
namespace op::hardtanh::cpu {
Descriptor::Descriptor(infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id,
float min_val,
float max_val)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_workspace_size(workspace_size),
_min_val(min_val),
_max_val(max_val) {}
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float min_val,
float max_val) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
CHECK_RESULT(info_result);
*desc_ptr = new Descriptor(
dtype,
info_result.take(),
0,
handle->device,
handle->device_id,
min_val,
max_val);
return INFINI_STATUS_SUCCESS;
}
template <typename T>
static infiniStatus_t launchCpuHardTanh(const op::elementwise::ElementwiseInfo &info,
void *output,
const std::vector<const void *> &inputs,
float min_val,
float max_val) {
if (inputs.empty()) {
return INFINI_STATUS_BAD_PARAM;
}
T *out = reinterpret_cast<T *>(output);
const T *in = reinterpret_cast<const T *>(inputs[0]);
const auto ndim = info.getNdim();
const auto *output_shape = info.getOutputShape();
const auto *output_strides = info.getOutputStrides();
const auto *input_shape = info.getInputShape(0);
const auto *input_strides = info.getInputStrides(0);
const auto *input_contiguous = info.getInputContiguous();
ptrdiff_t output_size = info.getOutputSize();
#pragma omp parallel for if (output_size > 1024)
for (ptrdiff_t i = 0; i < output_size; ++i) {
const size_t out_idx = info.isOutputContiguous()
? static_cast<size_t>(i)
: op::common_cpu::indexToOffset(i, ndim, output_shape, output_strides);
const size_t in_idx = input_contiguous[0]
? static_cast<size_t>(i)
: op::common_cpu::indexToOffset(i, ndim, input_shape, input_strides);
if constexpr (std::is_same_v<T, fp16_t> || std::is_same_v<T, bf16_t>) {
float value = utils::cast<float>(in[in_idx]);
float clamped = HardTanhOp{}(value, min_val, max_val);
out[out_idx] = utils::cast<T>(clamped);
} else {
out[out_idx] = HardTanhOp{}(in[in_idx], min_val, max_val);
}
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
(void)workspace;
(void)workspace_size;
(void)stream;
if (inputs.size() != 1) {
return INFINI_STATUS_BAD_PARAM;
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return launchCpuHardTanh<bf16_t>(_info, output, inputs, _min_val, _max_val);
case INFINI_DTYPE_F16:
return launchCpuHardTanh<fp16_t>(_info, output, inputs, _min_val, _max_val);
case INFINI_DTYPE_F32:
return launchCpuHardTanh<float>(_info, output, inputs, _min_val, _max_val);
case INFINI_DTYPE_F64:
return launchCpuHardTanh<double>(_info, output, inputs, _min_val, _max_val);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::hardtanh::cpu
#ifndef __HARDTANH_CPU_H__
#define __HARDTANH_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <algorithm>
namespace op::hardtanh::cpu {
class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
size_t _workspace_size;
float _min_val;
float _max_val;
Descriptor(infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id,
float min_val,
float max_val);
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float min_val,
float max_val);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
float minVal() const { return _min_val; }
float maxVal() const { return _max_val; }
};
typedef struct HardTanhOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
T operator()(const T &x, float min_val, float max_val) const {
T low = static_cast<T>(min_val);
T high = static_cast<T>(max_val);
T val = x < low ? low : x;
return val > high ? high : val;
}
} HardTanhOp;
} // namespace op::hardtanh::cpu
#endif
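Since the CPU `HardTanhOp` is a plain host functor, it can be exercised in isolation. A minimal sanity check, assuming only the header above is included:

```cpp
#include <cassert>

void hardtanh_op_smoke_test() {
    op::hardtanh::cpu::HardTanhOp op{};
    assert(op(2.5f, -1.0f, 1.0f) == 1.0f);   // clamped from above
    assert(op(-3.0f, -1.0f, 1.0f) == -1.0f); // clamped from below
    assert(op(0.5f, -1.0f, 1.0f) == 0.5f);   // inside the band, unchanged
}
```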
#ifndef __HARDTANH_CUDA_H__
#define __HARDTANH_CUDA_H__
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
#include <type_traits>
namespace op::hardtanh::cuda {
typedef struct HardTanhOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
__device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const {
if constexpr (std::is_same_v<T, half2>) {
float2 x_f2 = __half22float2(x);
x_f2.x = fminf(max_val, fmaxf(min_val, x_f2.x));
x_f2.y = fminf(max_val, fmaxf(min_val, x_f2.y));
return __float22half2_rn(x_f2);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
float x_f = __bfloat162float(x);
return __float2bfloat16(fminf(max_val, fmaxf(min_val, x_f)));
} else if constexpr (std::is_same_v<T, half>) {
float x_f = __half2float(x);
return __float2half(fminf(max_val, fmaxf(min_val, x_f)));
} else if constexpr (std::is_same_v<T, float>) {
return fminf(max_val, fmaxf(min_val, x));
} else if constexpr (std::is_same_v<T, double>) {
return fmin((double)max_val, fmax((double)min_val, x));
} else {
// Fallback for other scalar types: clamp through float, mirroring the Moore
// kernel; without this, an unsupported T falls off the end of the function.
float x_f = static_cast<float>(x);
return static_cast<T>(fminf(max_val, fmaxf(min_val, x_f)));
}
}
} HardTanhOp;
} // namespace op::hardtanh::cuda
#endif
#ifndef __HARDTANH_METAX_API_H__
#define __HARDTANH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
namespace op::hardtanh::metax {
class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
size_t _workspace_size;
float _min_val;
float _max_val;
Descriptor(infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::metax::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id,
float min_val,
float max_val);
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float min_val,
float max_val);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
};
} // namespace op::hardtanh::metax
#endif // __HARDTANH_METAX_API_H__
#include "hardtanh_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::hardtanh::metax {
Descriptor::Descriptor(infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::metax::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id,
float min_val,
float max_val)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_device_info(device_info),
_workspace_size(workspace_size),
_min_val(min_val),
_max_val(max_val) {}
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float min_val,
float max_val) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
CHECK_RESULT(info_result);
auto info = info_result.take();
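// Workspace holds the elementwise metadata plus a device-side array of per-input pointers.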
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);
auto device_impl_result = op::elementwise::metax::DeviceImpl::create(handle->internal());
CHECK_RESULT(device_impl_result);
*desc_ptr = new Descriptor(
dtype,
std::move(info),
device_impl_result.take(),
workspace_size,
handle->device,
handle->device_id,
min_val,
max_val);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(
_info, workspace, output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::HardTanhOp, half>(
_info, workspace, output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::HardTanhOp, float>(
_info, workspace, output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::HardTanhOp, double>(
_info, workspace, output, inputs, stream, _min_val, _max_val);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::hardtanh::metax
#ifndef __HARDTANH_MOORE_API_H__
#define __HARDTANH_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
namespace op::hardtanh::moore {
class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
std::unique_ptr<op::elementwise::moore::DeviceImpl> _device_info;
size_t _workspace_size;
float _min_val;
float _max_val;
Descriptor(infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::moore::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id,
float min_val,
float max_val);
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float min_val,
float max_val);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
float minVal() const { return _min_val; }
float maxVal() const { return _max_val; }
};
} // namespace op::hardtanh::moore
#endif // __HARDTANH_MOORE_API_H__
#include "hardtanh_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "hardtanh_moore_kernel.h"
namespace op::hardtanh::moore {
namespace {
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
return info.isOutputContiguous() && info.getInputSize() == 1 &&
info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}
template <typename T>
INFINIOP_MOORE_KERNEL hardtanh_contiguous_kernel(size_t numel,
T *out,
const T *in,
float min_val,
float max_val) {
const auto op = op::hardtanh::moore::HardTanhOp{};
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (; idx < numel; idx += stride) {
out[idx] = op(in[idx], min_val, max_val);
}
}
template <typename T>
infiniStatus_t launch_fast_path(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream,
float min_val,
float max_val) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int kBlockSize = 256;
int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
if (grid > 65535) {
grid = 65535;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
hardtanh_contiguous_kernel<T><<<grid, kBlockSize, 0, musa_stream>>>(
numel,
reinterpret_cast<T *>(output),
reinterpret_cast<const T *>(inputs[0]),
min_val,
max_val);
return INFINI_STATUS_SUCCESS;
}
} // namespace
Descriptor::Descriptor(infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::moore::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id,
float min_val,
float max_val)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_device_info(device_info),
_workspace_size(workspace_size),
_min_val(min_val),
_max_val(max_val) {}
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float min_val,
float max_val) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
CHECK_RESULT(info_result);
auto info = info_result.take();
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);
auto device_impl_result = op::elementwise::moore::DeviceImpl::create(handle->internal());
CHECK_RESULT(device_impl_result);
*desc_ptr = new Descriptor(
dtype,
std::move(info),
device_impl_result.take(),
workspace_size,
handle->device,
handle->device_id,
min_val,
max_val);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
const bool fast_path = can_use_contiguous_fast_path(_info);
if (fast_path) {
switch (_dtype) {
case INFINI_DTYPE_BF16:
return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F16:
return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F32:
return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F64:
return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
default:
break;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, moore::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, moore::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, moore::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, moore::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::hardtanh::moore
#ifndef __HARDTANH_MOORE_KERNEL_H__
#define __HARDTANH_MOORE_KERNEL_H__
#include <cmath>
#include <type_traits>
namespace op::hardtanh::moore {
typedef struct HardTanhOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
__device__ __forceinline__ T operator()(const T &x, float min_val, float max_val) const {
if constexpr (std::is_same_v<T, half>) {
float x_f = __half2float(x);
return __float2half(fminf(max_val, fmaxf(min_val, x_f)));
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
float x_f = __bfloat162float(x);
return __float2bfloat16_rn(fminf(max_val, fmaxf(min_val, x_f)));
} else if constexpr (std::is_same_v<T, float>) {
return fminf(max_val, fmaxf(min_val, x));
} else if constexpr (std::is_same_v<T, double>) {
return fmin((double)max_val, fmax((double)min_val, x));
} else {
float x_f = static_cast<float>(x);
return static_cast<T>(fminf(max_val, fmaxf(min_val, x_f)));
}
}
} HardTanhOp;
} // namespace op::hardtanh::moore
#endif // __HARDTANH_MOORE_KERNEL_H__
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "hardtanh_nvidia.cuh"
#include <cuda_runtime.h>
namespace op::hardtanh::nvidia {
namespace {
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}
template <typename T>
__global__ void hardtanh_contiguous_kernel(size_t numel, T *out, const T *in, float min_val, float max_val) {
const auto op = op::hardtanh::cuda::HardTanhOp{};
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
while (idx < numel) {
out[idx] = op(in[idx], min_val, max_val);
idx += blockDim.x * gridDim.x;
}
}
template <typename T>
infiniStatus_t launch_fast_path(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream,
float min_val,
float max_val) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int BLOCK_SIZE = 256;
int grid = static_cast<int>((numel + BLOCK_SIZE - 1) / BLOCK_SIZE);
grid = std::min(grid, 65535);
auto *out_ptr = reinterpret_cast<T *>(output);
auto *in_ptr = reinterpret_cast<const T *>(inputs[0]);
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
hardtanh_contiguous_kernel<<<grid, BLOCK_SIZE, 0, cuda_stream>>>(numel, out_ptr, in_ptr, min_val, max_val);
cudaError_t err = cudaGetLastError();
return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace
Descriptor::Descriptor(infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::nvidia::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id,
float min_val,
float max_val)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)),
_device_info(device_info),
_workspace_size(workspace_size),
_min_val(min_val),
_max_val(max_val) {}
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float min_val,
float max_val) {
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
auto info_result = op::elementwise::ElementwiseInfo::create(out_desc, input_desc_vec);
CHECK_RESULT(info_result);
auto info = info_result.take();
auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *);
auto device_impl_result = op::elementwise::nvidia::DeviceImpl::create(handle->internal());
CHECK_RESULT(device_impl_result);
*desc_ptr = new Descriptor(
dtype,
std::move(info),
device_impl_result.take(),
workspace_size,
handle->device,
handle->device_id,
min_val,
max_val);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
const bool fast_path = can_use_contiguous_fast_path(_info);
if (fast_path) {
switch (_dtype) {
case INFINI_DTYPE_BF16:
return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F16:
return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F32:
return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F64:
return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream, _min_val, _max_val);
default:
break;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::HardTanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::HardTanhOp, half>(_info, workspace, output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::HardTanhOp, float>(_info, workspace, output, inputs, stream, _min_val, _max_val);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::HardTanhOp, double>(_info, workspace, output, inputs, stream, _min_val, _max_val);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::hardtanh::nvidia
#ifndef __HARDTANH_CUDA_API_H__
#define __HARDTANH_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
namespace op::hardtanh::nvidia {
class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
op::elementwise::ElementwiseInfo _info;
std::unique_ptr<op::elementwise::nvidia::DeviceImpl> _device_info;
size_t _workspace_size;
float _min_val;
float _max_val;
Descriptor(infiniDtype_t dtype,
op::elementwise::ElementwiseInfo info,
op::elementwise::nvidia::DeviceImpl *device_info,
size_t workspace_size,
infiniDevice_t device_type,
int device_id,
float min_val,
float max_val);
public:
~Descriptor();
size_t workspaceSize() const { return _workspace_size; }
static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec,
float min_val,
float max_val);
infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const;
float minVal() const { return _min_val; }
float maxVal() const { return _max_val; }
};
} // namespace op::hardtanh::nvidia
#endif
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/hardtanh.h"
#ifdef ENABLE_CPU_API
#include "cpu/hardtanh_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/hardtanh_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/hardtanh_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/hardtanh_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateHardTanhDescriptor(
infiniopHandle_t handle,
infiniopHardTanhDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
float min_val,
float max_val) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::hardtanh::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::hardtanh::NAMESPACE::Descriptor **>(desc_ptr), \
output_desc, \
{input_desc}, \
min_val, \
max_val)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetHardTanhWorkspaceSize(infiniopHardTanhDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::hardtanh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopHardTanh(
infiniopHardTanhDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::hardtanh::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, output, {input}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t
infiniopDestroyHardTanhDescriptor(infiniopHardTanhDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::hardtanh::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
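The HardTanh lifecycle mirrors the HardSwish sketch shown earlier; the only difference is that the clamp bounds are fixed per descriptor at creation time. A sketch under the same assumptions (hypothetical `CHECK`, pre-built handle/descriptors/buffers, workspace allocation elided):

```cpp
infiniopHardTanhDescriptor_t desc = nullptr;
// Bounds are baked into the descriptor, e.g. the common default band [-1, 1].
CHECK(infiniopCreateHardTanhDescriptor(handle, &desc, out_desc, in_desc,
                                       -1.0f, 1.0f));
size_t ws_size = 0;
CHECK(infiniopGetHardTanhWorkspaceSize(desc, &ws_size));
CHECK(infiniopHardTanh(desc, workspace, ws_size, d_out, d_in, stream));
CHECK(infiniopDestroyHardTanhDescriptor(desc));
```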
@@ -13,22 +13,6 @@ struct CustomBFloat16 {
};
typedef struct CustomBFloat16 bf16_t;
inline bool operator==(const CustomFloat16 &lhs, const CustomFloat16 &rhs) {
return lhs._v == rhs._v;
}
inline bool operator!=(const CustomFloat16 &lhs, const CustomFloat16 &rhs) {
return !(lhs == rhs);
}
inline bool operator==(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) {
return lhs._v == rhs._v;
}
inline bool operator!=(const CustomBFloat16 &lhs, const CustomBFloat16 &rhs) {
return !(lhs == rhs);
}
float _f16_to_f32(fp16_t val);
fp16_t _f32_to_f16(float val);
......
@@ -74,8 +74,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.avg_pool1d(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.nn.functional.avg_pool1d(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.avg_pool1d(*args, **kwargs)
def main():
......
@@ -11,8 +11,6 @@ from framework.tensor import TensorInitializer
# Test cases format: (input_shape_logits_N_C, target_shape_N, input_strides_or_None, weight_present_bool, ignore_index_or_None)
# infinicore.nn.functional.cross_entropy(input, target, weight=None, ignore_index=-100, reduction='mean')
# The CrossEntropy kernel currently only supports element-wise loss, without class weight/ignore_index.
# The original configurations are kept; once these features are implemented, just relax the filter conditions below.
_TEST_CASES_DATA = [
((4, 5), (4,), None, False, None),
((8, 10), (8,), None, True, -1),
@@ -22,9 +20,6 @@ _TEST_CASES_DATA = [
((2, 2), (2,), None, True, -100),
]
_SUPPORT_WEIGHT = False
_SUPPORT_IGNORE_INDEX = False
_TOLERANCE_MAP = {
infinicore.float16: {"atol": 1e-3, "rtol": 1e-2},
infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
@@ -45,11 +40,6 @@ def parse_test_cases():
) in _TEST_CASES_DATA:
for dtype in _TENSOR_DTYPES:
tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
if weight_present and not _SUPPORT_WEIGHT:
continue
if ignore_index is not None and not _SUPPORT_IGNORE_INDEX:
continue
logits = TensorSpec.from_tensor(logits_shape, logits_strides, dtype)
target = TensorSpec.from_tensor(
target_shape,
@@ -61,7 +51,7 @@ )
)
inputs = [logits, target]
kwargs = {"reduction": "none"}
kwargs = {}
if weight_present:
weight_spec = TensorSpec.from_tensor((logits_shape[1],), None, dtype)
inputs.append(weight_spec)
@@ -94,10 +84,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.cross_entropy(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
"""InfiniCore implementation."""
out = kwargs.pop("out", None)
return infinicore.cross_entropy(*args, out=out, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.cross_entropy(*args, **kwargs)
def main():
......
@@ -74,11 +74,8 @@ def parse_test_cases():
)
)
# Equal produces bool results, so float/integer inputs cannot be safely reused as output buffers.
# In-place is only allowed when the input dtype is itself bool; the switch is left here in advance.
allow_input_inplace = dtype == infinicore.bool
if allow_input_inplace and a_supports_inplace:
# in-place a
if a_supports_inplace:
test_cases.append(
TestCase(
inputs=[a_spec, b_spec],
@@ -90,7 +87,8 @@ )
)
)
if allow_input_inplace and b_supports_inplace:
# in-place b
if b_supports_inplace:
test_cases.append(
TestCase(
inputs=[a_spec, b_spec],
@@ -117,8 +115,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.eq(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.equal(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.eq(*args, **kwargs)
def main():
......
@@ -70,8 +70,9 @@ class OpTest(BaseOperatorTest):
def torch_operator(self, *args, **kwargs):
return torch.nn.functional.hardswish(*args, **kwargs)
def infinicore_operator(self, *args, **kwargs):
return infinicore.nn.functional.hardswish(*args, **kwargs)
# def infinicore_operator(self, *args, **kwargs):
# """InfiniCore implementation (operator not yet available)."""
# return infinicore.nn.functional.hardswish(*args, **kwargs)
def main():
......