Commit def22a08 authored by wooway777

Revert "Merge pull request #1056 from InfiniTensor/issue/1031"

This reverts commit 7f295448, reversing
changes made to e60985dc.
parent 1795b38a
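
// ===== cross_entropy_nvidia.cuh: NVIDIA-family descriptor declaration =====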
#ifndef __CROSS_ENTROPY_NVIDIA_H__
#define __CROSS_ENTROPY_NVIDIA_H__
#include "../cross_entropy.h"
DESCRIPTOR(nvidia)
#endif
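// ===== cross_entropy operator dispatch (C API entry points) =====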
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/cross_entropy.h"
#ifdef ENABLE_CPU_API
#include "cpu/cross_entropy_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
#include "nvidia/cross_entropy_nvidia.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/cross_entropy_moore.h"
#endif
#ifdef ENABLE_METAX_API
#include "metax/cross_entropy_metax.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateCrossEntropyDescriptor(
infiniopHandle_t handle,
infiniopCrossEntropyDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t target_desc) {
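// Each entry point expands one case per backend enabled at build time via a
// local macro and dispatches on the device type recorded in the handle/descriptor.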
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::cross_entropy::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor **>(desc_ptr), \
y_desc, x_desc, target_desc);
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
CREATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetCrossEntropyWorkspaceSize(
infiniopCrossEntropyDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
GET(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
}
__INFINI_C infiniStatus_t infiniopCrossEntropy(
infiniopCrossEntropyDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
const void *target,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, x, target, stream);
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
CALCULATE(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t infiniopDestroyCrossEntropyDescriptor(
infiniopCrossEntropyDescriptor_t desc) {
#define DESTROY(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<op::cross_entropy::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DESTROY(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_ILUVATAR_API
DESTROY(INFINI_DEVICE_ILUVATAR, nvidia)
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_HYGON_API
DESTROY(INFINI_DEVICE_HYGON, nvidia)
#endif
#ifdef ENABLE_MOORE_API
DESTROY(INFINI_DEVICE_MOORE, moore)
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DESTROY
}
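// Usage sketch (illustrative, not part of this commit): how a caller might drive
// the cross-entropy C API defined above. The handle, tensor descriptors, device
// buffers y/x/target, workspace allocation, and stream are assumed to already
// exist; error-status checks are elided.
//
//   infiniopCrossEntropyDescriptor_t ce_desc;
//   infiniopCreateCrossEntropyDescriptor(handle, &ce_desc, y_desc, x_desc, target_desc);
//   size_t ce_ws = 0;
//   infiniopGetCrossEntropyWorkspaceSize(ce_desc, &ce_ws);
//   // allocate at least ce_ws bytes of device workspace, then:
//   infiniopCrossEntropy(ce_desc, workspace, ce_ws, y, x, target, stream);
//   infiniopDestroyCrossEntropyDescriptor(ce_desc);

// ===== CPU implementation of the equal op =====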
#include <cstdint>
#include <type_traits>
#include "equal_cpu.h"
namespace op::equal::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
if (compute_dtype != b_desc->dtype()) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64,
INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<EqualOp, bool, fp16_t, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<EqualOp, bool, float, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<EqualOp, bool, double, double>(_info, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<EqualOp, bool, bf16_t, bf16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<EqualOp, bool, int32_t, int32_t>(_info, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<EqualOp, bool, int64_t, int64_t>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::equal::cpu
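// ===== equal_cpu.h: CPU descriptor declaration and EqualOp functor =====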
#ifndef __EQUAL_CPU_H__
#define __EQUAL_CPU_H__
#include <type_traits>
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(equal, cpu)
namespace op::equal::cpu {
typedef struct EqualOp {
public:
static constexpr size_t num_inputs = 2;
template <typename Tout, typename Tin0, typename Tin1>
bool operator()(const Tin0 &a, const Tin1 &b) const {
if constexpr (std::is_same_v<Tin0, Tin1>) {
return a == b;
} else {
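// Unreachable in practice: descriptor creation rejects mismatched input dtypes.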
return false;
}
}
} EqualOp;
} // namespace op::equal::cpu
#endif
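// ===== cuda/kernel.cuh: shared CUDA EqualOp functor =====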
#ifndef __EQUAL_CUDA_H__
#define __EQUAL_CUDA_H__
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
#include <type_traits>
namespace op::equal::cuda {
typedef struct EqualOp {
public:
static constexpr size_t num_inputs = 2;
template <typename Tout, typename Tin0, typename Tin1>
__device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
if constexpr (std::is_same_v<Tin0, Tin1>) {
if constexpr (std::is_same_v<Tin0, half2>) {
static_assert(!std::is_same_v<Tin0, half2>, "packed half2 inputs are not supported: EqualOp produces one bool per element");
} else if constexpr (std::is_same_v<Tin0, half>) {
return static_cast<Tout>(__heq(a, b));
} else {
return static_cast<Tout>(a == b);
}
} else {
return false;
}
}
} EqualOp;
} // namespace op::equal::cuda
#endif
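// ===== equal_metax.h: Metax descriptor declaration =====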
#ifndef __EQUAL_METAX_API_H__
#define __EQUAL_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(equal, metax)
#endif // __EQUAL_METAX_API_H__
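// ===== Metax implementation of the equal op =====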
#include "equal_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::equal::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::equal::metax
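// ===== equal_moore.h: Moore descriptor declaration =====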
#ifndef __EQUAL_MOORE_API_H__
#define __EQUAL_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
ELEMENTWISE_DESCRIPTOR(equal, moore)
#endif // __EQUAL_MOORE_API_H__
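// ===== Moore implementation of the equal op =====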
#include "equal_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "equal_moore_kernel.h"
namespace op::equal::moore {
namespace {
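// The fast path is valid only when the output and both inputs are contiguous and
// nothing is broadcast, so one flat index addresses all three tensors identically.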
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
if (!info.isOutputContiguous()) {
return false;
}
const bool *input_contiguous = info.getInputContiguous();
const bool *input_broadcasted = info.getInputBroadcasted();
for (size_t i = 0; i < 2; ++i) {
if (!input_contiguous[i] || input_broadcasted[i]) {
return false;
}
}
return true;
}
template <typename Tout, typename Tin>
INFINIOP_MOORE_KERNEL equal_contiguous_kernel(size_t numel, Tout *output, const Tin *a, const Tin *b) {
const auto op = op::equal::moore::EqualOp{};
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
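// Grid-stride loop: each thread visits idx, idx + stride, ..., so the kernel
// covers all numel elements even though the launcher caps the grid size.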
for (; idx < numel; idx += stride) {
output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
}
}
template <typename Tout, typename Tin>
infiniStatus_t launch_fast_path(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int kBlockSize = 256;
int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
if (grid > 65535) {
grid = 65535;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
equal_contiguous_kernel<Tout, Tin><<<grid, kBlockSize, 0, musa_stream>>>(
numel,
reinterpret_cast<Tout *>(output),
reinterpret_cast<const Tin *>(inputs[0]),
reinterpret_cast<const Tin *>(inputs[1]));
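// Note: the launch result is not checked here; success only means the kernel was
// enqueued (the NVIDIA fast path below additionally checks cudaGetLastError).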
return INFINI_STATUS_SUCCESS;
}
} // namespace
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create MOORE elementwise descriptor
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (can_use_contiguous_fast_path(_info)) {
size_t numel = _info.getOutputSize();
switch (_dtype) {
case INFINI_DTYPE_F16:
return launch_fast_path<bool, half>(numel, output, inputs, stream);
case INFINI_DTYPE_BF16:
return launch_fast_path<bool, cuda_bfloat16>(numel, output, inputs, stream);
case INFINI_DTYPE_F32:
return launch_fast_path<bool, float>(numel, output, inputs, stream);
case INFINI_DTYPE_I32:
return launch_fast_path<bool, int32_t>(numel, output, inputs, stream);
case INFINI_DTYPE_I64:
return launch_fast_path<bool, int64_t>(numel, output, inputs, stream);
case INFINI_DTYPE_F64:
return launch_fast_path<bool, double>(numel, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, moore::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, moore::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, moore::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<256, moore::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<256, moore::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, moore::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::equal::moore
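// ===== equal_moore_kernel.h: Moore EqualOp functor =====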
#ifndef __EQUAL_MOORE_KERNEL_H__
#define __EQUAL_MOORE_KERNEL_H__
#include <type_traits>
namespace op::equal::moore {
typedef struct EqualOp {
public:
static constexpr size_t num_inputs = 2;
template <typename Tout, typename Tin0, typename Tin1>
__device__ __forceinline__ bool operator()(const Tin0 &a, const Tin1 &b) const {
if constexpr (std::is_same_v<Tin0, Tin1>) {
if constexpr (std::is_same_v<Tin0, half>) {
return __half2float(a) == __half2float(b);
} else if constexpr (std::is_same_v<Tin0, cuda_bfloat16>) {
return __bfloat162float(a) == __bfloat162float(b);
} else {
return a == b;
}
} else {
return false;
}
}
} EqualOp;
} // namespace op::equal::moore
#endif // __EQUAL_MOORE_KERNEL_H__
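// ===== NVIDIA implementation of the equal op =====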
#include <algorithm>
#include <cstdint>
#include <type_traits>
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "equal_nvidia.cuh"
namespace {
template <typename Tout, typename Tin>
INFINIOP_CUDA_KERNEL FastEqualKernel(size_t n, Tout *output, const Tin *a, const Tin *b) {
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
op::equal::cuda::EqualOp op{};
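// Grid-stride loop so the capped 65535-block grid still covers every element.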
for (; idx < n; idx += stride) {
output[idx] = op.template operator()<Tout, Tin>(a[idx], b[idx]);
}
}
template <typename Tout, typename Tin>
infiniStatus_t launchFastEqualKernel(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int block = 256;
int grid = static_cast<int>((numel + block - 1) / block);
grid = std::min(grid, 65535);
auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
FastEqualKernel<Tout, Tin><<<grid, block, 0, cuda_stream>>>(
numel,
reinterpret_cast<Tout *>(output),
reinterpret_cast<const Tin *>(inputs[0]),
reinterpret_cast<const Tin *>(inputs[1]));
auto err = cudaGetLastError();
return err == cudaSuccess ? INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace
namespace op::equal::nvidia {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
const auto &a_desc = input_desc_vec.at(0);
auto compute_dtype = a_desc->dtype();
auto out_dtype = out_desc->dtype();
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(compute_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16,
INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_F64);
CHECK_DTYPE(out_dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_U8, INFINI_DTYPE_I8);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, compute_dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
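// Fast path: when the output and both inputs are contiguous and non-broadcast, a
// flat 1D kernel avoids the generic strided-indexing machinery and its workspace.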
bool fast_path = _info.isOutputContiguous();
if (fast_path) {
const bool *input_contiguous = _info.getInputContiguous();
const bool *input_broadcasted = _info.getInputBroadcasted();
for (size_t i = 0; i < 2; ++i) {
fast_path &= input_contiguous[i] && !input_broadcasted[i];
}
}
if (fast_path) {
size_t numel = _info.getOutputSize();
switch (_dtype) {
case INFINI_DTYPE_F16:
return launchFastEqualKernel<bool, half>(numel, output, inputs, stream);
case INFINI_DTYPE_BF16:
return launchFastEqualKernel<bool, cuda_bfloat16>(numel, output, inputs, stream);
case INFINI_DTYPE_F32:
return launchFastEqualKernel<bool, float>(numel, output, inputs, stream);
case INFINI_DTYPE_I32:
return launchFastEqualKernel<bool, int32_t>(numel, output, inputs, stream);
case INFINI_DTYPE_I64:
return launchFastEqualKernel<bool, int64_t>(numel, output, inputs, stream);
case INFINI_DTYPE_F64:
return launchFastEqualKernel<bool, double>(numel, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I32:
return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_I64:
return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::equal::nvidia
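// ===== equal_nvidia.cuh: NVIDIA descriptor declaration =====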
#ifndef __EQUAL_CUDA_API_H__
#define __EQUAL_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(equal, nvidia)
#endif
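// ===== equal operator dispatch (C API entry points) =====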
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/equal.h"
#ifdef ENABLE_CPU_API
#include "cpu/equal_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/equal_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/equal_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/equal_kunlun.h"
#endif
#ifdef ENABLE_CAMBRICON_API
#include "bang/equal_bang.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/equal_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateEqualDescriptor(
infiniopHandle_t handle,
infiniopEqualDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t b_desc) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::equal::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::equal::NAMESPACE::Descriptor **>(desc_ptr), \
c_desc, \
{a_desc, b_desc})
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
CREATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::equal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
GET(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
}
__INFINI_C infiniStatus_t infiniopEqual(
infiniopEqualDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
const void *b,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, c, {a, b}, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t
infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::equal::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_CAMBRICON_API
DELETE(INFINI_DEVICE_CAMBRICON, bang);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
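// Usage sketch (illustrative, not part of this commit): the equal op compares two
// same-shape, same-dtype tensors a and b elementwise into a BOOL tensor c. The
// handle, descriptors, buffers, and stream are assumed to exist; error-status
// checks are elided.
//
//   infiniopEqualDescriptor_t eq_desc;
//   infiniopCreateEqualDescriptor(handle, &eq_desc, c_desc, a_desc, b_desc);
//   size_t eq_ws = 0;
//   infiniopGetEqualWorkspaceSize(eq_desc, &eq_ws);
//   infiniopEqual(eq_desc, workspace, eq_ws, c, a, b, stream);
//   infiniopDestroyEqualDescriptor(eq_desc);

// ===== CPU implementation of the hardswish op =====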
#include "hardswish_cpu.h"
#include <cstddef>
namespace op::hardswish::cpu {
namespace {
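// Single-input variant of the contiguous fast-path check: the output and the one
// input must be contiguous and non-broadcast.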
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
return info.isOutputContiguous() && info.getInputSize() == 1 && info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}
template <typename T>
infiniStatus_t launch_contiguous_cpu(const op::elementwise::ElementwiseInfo &info,
void *output,
const std::vector<const void *> &inputs) {
const T *in = reinterpret_cast<const T *>(inputs[0]);
T *out = reinterpret_cast<T *>(output);
const ptrdiff_t size = static_cast<ptrdiff_t>(info.getOutputSize());
#pragma omp parallel for if (size > 1024)
for (ptrdiff_t i = 0; i < size; ++i) {
out[i] = HardSwishOp{}(in[i]);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
const bool fast_path = can_use_contiguous_fast_path(_info);
if (fast_path) {
switch (_dtype) {
case INFINI_DTYPE_BF16:
return launch_contiguous_cpu<bf16_t>(_info, output, inputs);
case INFINI_DTYPE_F16:
return launch_contiguous_cpu<fp16_t>(_info, output, inputs);
case INFINI_DTYPE_F32:
return launch_contiguous_cpu<float>(_info, output, inputs);
case INFINI_DTYPE_F64:
return launch_contiguous_cpu<double>(_info, output, inputs);
default:
break;
}
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<HardSwishOp, bf16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<HardSwishOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<HardSwishOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<HardSwishOp, double>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::hardswish::cpu
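// ===== hardswish_cpu.h: CPU descriptor declaration and functors =====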
#ifndef __HARDSWISH_CPU_H__
#define __HARDSWISH_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(hardswish, cpu)
#include <algorithm>
#include <cmath>
namespace op::hardswish::cpu {
typedef struct HardSwishOp {
public:
static constexpr size_t num_inputs = 1;
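// hardswish(x) = x * clamp(x + 3, 0, 6) / 6, evaluated in f32 regardless of T
// (note: f64 inputs also pass through f32 here).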
template <typename T>
T operator()(const T &x) const {
const float x_f = utils::cast<float>(x);
const float clamped = std::min(std::max(x_f + 3.0f, 0.0f), 6.0f);
const float result = x_f * clamped * (1.0f / 6.0f);
return utils::cast<T>(result);
}
} HardSwishOp;
typedef struct HardSwishContiguousOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
T operator()(const T &x) const {
T three = static_cast<T>(3);
T zero = static_cast<T>(0);
T six = static_cast<T>(6);
T scale = static_cast<T>(0.16666667f);
T val = x + three;
val = std::max(zero, val);
val = std::min(six, val);
return x * val * scale;
}
} HardSwishContiguousOp;
} // namespace op::hardswish::cpu
#endif
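// ===== cuda/kernel.cuh: shared CUDA HardSwishOp functor =====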
#ifndef __HARDSWISH_CUDA_H__
#define __HARDSWISH_CUDA_H__
#include <cmath>
#if defined(__MACACC__)
#include <maca_bfloat16.h>
#include <maca_fp16.h>
#else
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#endif
namespace op::hardswish::cuda {
typedef struct HardSwishOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
__device__ __forceinline__ T operator()(const T &x) const {
if constexpr (std::is_same_v<T, half2>) {
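// Packed half2 path: clamp both lanes of x + 3 to [0, 6], then compute x * clamped / 6.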
const half2 three = __float2half2_rn(3.0f);
const half2 scale = __float2half2_rn(0.16666667f);
half2 val = __hadd2(x, three);
#if defined(ENABLE_ILUVATAR_API)
float2 val_f = __half22float2(val);
val_f.x = fminf(fmaxf(val_f.x, 0.0f), 6.0f);
val_f.y = fminf(fmaxf(val_f.y, 0.0f), 6.0f);
val = __floats2half2_rn(val_f.x, val_f.y);
#else
const half2 zero = __float2half2_rn(0.0f);
const half2 six = __float2half2_rn(6.0f);
#if __CUDA_ARCH__ >= 800
val = __hmin2(__hmax2(val, zero), six);
#else
val = __hmax2(val, zero);
val = __hmin2(val, six);
#endif
#endif
return __hmul2(__hmul2(x, val), scale);
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
const float x_f = __bfloat162float(x);
const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
return __float2bfloat16(x_f * val * 0.16666667f);
} else if constexpr (std::is_same_v<T, half>) {
const float x_f = __half2float(x);
const float val = fminf(fmaxf(x_f + 3.0f, 0.0f), 6.0f);
return __float2half(x_f * val * 0.16666667f);
} else if constexpr (std::is_same_v<T, float>) {
const float val = fminf(fmaxf(x + 3.0f, 0.0f), 6.0f);
return x * val * 0.16666667f;
} else if constexpr (std::is_same_v<T, double>) {
const double val = fmin(fmax(x + 3.0, 0.0), 6.0);
return x * val * (1.0 / 6.0);
}
}
} HardSwishOp;
} // namespace op::hardswish::cuda
#endif
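// ===== hardswish_metax.h: Metax descriptor declaration =====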
#ifndef __HARDSWISH_METAX_API_H__
#define __HARDSWISH_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(hardswish, metax)
#endif // __HARDSWISH_METAX_API_H__
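// ===== Metax implementation of the hardswish op =====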
#include "hardswish_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::hardswish::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::HardSwishOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::HardSwishOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::HardSwishOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::hardswish::metax
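// ===== hardswish_moore.h: Moore descriptor declaration =====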
#ifndef __HARDSWISH_MOORE_API_H__
#define __HARDSWISH_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
ELEMENTWISE_DESCRIPTOR(hardswish, moore)
#endif // __HARDSWISH_MOORE_API_H__
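// ===== Moore implementation of the hardswish op =====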
#include "hardswish_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "hardswish_moore_kernel.h"
namespace op::hardswish::moore {
namespace {
inline bool can_use_contiguous_fast_path(const op::elementwise::ElementwiseInfo &info) {
return info.isOutputContiguous() && info.getInputSize() == 1 &&
info.getInputContiguous()[0] && !info.getInputBroadcasted()[0];
}
template <typename T>
INFINIOP_MOORE_KERNEL hardswish_contiguous_kernel(size_t numel, T *out, const T *in) {
const auto op = op::hardswish::moore::HardSwishOp{};
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t stride = blockDim.x * gridDim.x;
for (; idx < numel; idx += stride) {
out[idx] = op(in[idx]);
}
}
template <typename T>
infiniStatus_t launch_fast_path(size_t numel,
void *output,
const std::vector<const void *> &inputs,
void *stream) {
if (numel == 0) {
return INFINI_STATUS_SUCCESS;
}
constexpr int kBlockSize = 256;
int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
if (grid > 65535) {
grid = 65535;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
hardswish_contiguous_kernel<T><<<grid, kBlockSize, 0, musa_stream>>>(
numel,
reinterpret_cast<T *>(output),
reinterpret_cast<const T *>(inputs[0]));
return INFINI_STATUS_SUCCESS;
}
} // namespace
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_SAME_SHAPE(output_shape, input_shape);
// create MOORE elementwise descriptor
CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
const bool fast_path = can_use_contiguous_fast_path(_info);
if (fast_path) {
switch (_dtype) {
case INFINI_DTYPE_BF16:
return launch_fast_path<cuda_bfloat16>(_info.getOutputSize(), output, inputs, stream);
case INFINI_DTYPE_F16:
return launch_fast_path<half>(_info.getOutputSize(), output, inputs, stream);
case INFINI_DTYPE_F32:
return launch_fast_path<float>(_info.getOutputSize(), output, inputs, stream);
case INFINI_DTYPE_F64:
return launch_fast_path<double>(_info.getOutputSize(), output, inputs, stream);
default:
break;
}
}
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, moore::HardSwishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, moore::HardSwishOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, moore::HardSwishOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, moore::HardSwishOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
} // namespace op::hardswish::moore