Unverified Commit 9b8de584 authored by pengcheng888's avatar pengcheng888 Committed by GitHub
Browse files

issue/473 - the ones and zeros operators


Co-authored-by: default avatarpengcheng888 <pengcheng@example.com>
parent f5e6d729
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/ones.h"
#ifdef ENABLE_CPU_API
#include "cpu/ones_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/ones_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/ones_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/ones_moore.h"
#endif
// Creates a Ones descriptor for the device held by `handle`.
// The unary (y, x) signature is forwarded to the backend's generic
// elementwise create(handle, desc, out_desc, {input_descs}).
__C infiniStatus_t infiniopCreateOnesDescriptor(
    infiniopHandle_t handle,
    infiniopOnesDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

#define CREATE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        return op::ones::NAMESPACE::Descriptor::create(                     \
            handle,                                                         \
            reinterpret_cast<op::ones::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                         \
            {x_desc})

    // Dispatch on the runtime device type; only compiled-in backends have cases.
    switch (handle->device) {

#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Reports the workspace size (in bytes) required by a Ones descriptor.
__C infiniStatus_t infiniopGetOnesWorkspaceSize(infiniopOnesDescriptor_t desc, size_t *size) {

#define GET(CASE, NAMESPACE)                                                             \
    case CASE:                                                                           \
        *size = reinterpret_cast<op::ones::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    // Unreachable: every switch path returns above. Kept (apparently) to
    // satisfy compilers that warn about control reaching the end of the
    // function; the other dispatchers in this file omit it.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Executes the Ones operator: fills `y` with ones, using `x` only for
// shape/layout (the elementwise framework receives it as the single input).
__C infiniStatus_t infiniopOnes(
    infiniopOnesDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                          \
    case CASE:                                                              \
        return reinterpret_cast<const op::ones::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x}, stream)

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// Destroys a Ones descriptor previously created by
// infiniopCreateOnesDescriptor, deleting the backend-specific object.
__C infiniStatus_t
infiniopDestroyOnesDescriptor(infiniopOnesDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        delete reinterpret_cast<const op::ones::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
#include "zeros_cpu.h"
namespace op::zeros::cpu {

Descriptor::~Descriptor() = default;

// Validates descriptors and builds the CPU elementwise descriptor for Zeros.
// Exactly one input (x) is expected; its shape must match the output's, and
// its values are ignored at compute time (ZerosOp discards its argument).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    // NOTE(review): F8 passes this check but calculate() returns
    // NOT_IMPLEMENTED for it below — confirm whether F8 should be rejected here.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE, // 1
                INFINI_DTYPE_BOOL, // 2
                INFINI_DTYPE_I8,   // 3
                INFINI_DTYPE_I16,  // 4
                INFINI_DTYPE_I32,  // 5
                INFINI_DTYPE_I64,  // 6
                INFINI_DTYPE_U8,   // 7
                INFINI_DTYPE_U16,  // 8
                INFINI_DTYPE_U32,  // 9
                INFINI_DTYPE_U64,  // 10
                INFINI_DTYPE_F8,   // 11
                INFINI_DTYPE_F16,  // 12
                INFINI_DTYPE_F32,  // 13
                INFINI_DTYPE_F64,  // 14
                INFINI_DTYPE_BF16, // 19
    );

    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Fills `output` with zeros of the descriptor's dtype by dispatching the
// templated elementwise kernel on _dtype. Complex and F8 types are accepted
// by the enum but not implemented.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // 1
        return _device_info->calculate<ZerosOp, uint8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_BOOL: // 2
        return _device_info->calculate<ZerosOp, bool>(_info, output, inputs, stream);
    case INFINI_DTYPE_I8: // 3
        return _device_info->calculate<ZerosOp, int8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I16: // 4
        return _device_info->calculate<ZerosOp, int16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I32: // 5
        return _device_info->calculate<ZerosOp, int32_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I64: // 6
        return _device_info->calculate<ZerosOp, int64_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U8: // 7
        return _device_info->calculate<ZerosOp, uint8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U16: // 8
        return _device_info->calculate<ZerosOp, uint16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U32: // 9
        return _device_info->calculate<ZerosOp, uint32_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U64: // 10
        return _device_info->calculate<ZerosOp, uint64_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F8: // 11
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_F16: // 12
        return _device_info->calculate<ZerosOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32: // 13
        return _device_info->calculate<ZerosOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64: // 14
        return _device_info->calculate<ZerosOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_C16: // 15
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C32: // 16
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C64: // 17
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C128: // 18
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16: // 19
        return _device_info->calculate<ZerosOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every case above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::zeros::cpu
#ifndef __ZEROS_CPU_H__
#define __ZEROS_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(zeros, cpu)
namespace op::zeros::cpu {

// Elementwise functor for the Zeros operator: discards the input element and
// yields zero of the element type. num_inputs is the arity contract consumed
// by the generic elementwise framework.
struct ZerosOp {
public:
    static constexpr size_t num_inputs = 1;

    // Returns the zero value of T; the argument only fixes the type.
    template <typename T>
    T operator()(const T &input) const {
        return static_cast<T>(0.0);
    }
};

} // namespace op::zeros::cpu
#endif // __ZEROS_CPU_H__
#ifndef __ZEROS_CUDA_H__
#define __ZEROS_CUDA_H__
namespace op::zeros::cuda {

// Device-side elementwise functor for Zeros: ignores the input element and
// yields the zero of type T. num_inputs is the arity contract consumed by
// the generic elementwise framework.
//
// Fix: the original if-constexpr chain listed std::is_same_v<T, uint8_t>
// twice (branches "// 2" and "// 7"), making the second branch dead code.
// The chain is collapsed: only types needing explicit conversion
// intrinsics/constructors are special-cased; every other supported type
// (bool, all integer widths, float, double) is value-initialized, which is
// exactly the zero the per-type branches produced.
struct ZerosOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T & /*x*/) const {
        if constexpr (std::is_same_v<T, half>) { // F16
            return __float2half(0.0f);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { // BF16
            return __float2bfloat16(0.0f);
        } else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) { // F8 E4M3
            return cuda_fp8_e4m3(0.0f);
        } else {
            return T{};
        }
    }
};

} // namespace op::zeros::cuda
#endif // __ZEROS_CUDA_H__
#ifndef __ZEROS_METAX_API_H__
#define __ZEROS_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(zeros, metax)
#endif // __ZEROS_METAX_API_H__
#include "zeros_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::zeros::metax {

Descriptor::~Descriptor() = default;

// Validates descriptors and builds the METAX elementwise descriptor for
// Zeros. Exactly one input (x) is expected; its shape must match the
// output's, and its values are ignored at compute time.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,  // 1
                INFINI_DTYPE_BOOL,  // 2
                INFINI_DTYPE_I8,    // 3
                INFINI_DTYPE_I16,   // 4
                INFINI_DTYPE_I32,   // 5
                INFINI_DTYPE_I64,   // 6
                INFINI_DTYPE_U8,    // 7
                INFINI_DTYPE_U16,   // 8
                INFINI_DTYPE_U32,   // 9
                INFINI_DTYPE_U64,   // 10
                INFINI_DTYPE_F8,    // 11
                INFINI_DTYPE_F16,   // 12
                INFINI_DTYPE_F32,   // 13
                INFINI_DTYPE_F64,   // 14
                INFINI_DTYPE_BF16); // 19

    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create METAX elementwise descriptor
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Fills `output` with zeros of the descriptor's dtype by dispatching the
// templated elementwise kernel (block size 256) on _dtype.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // 1
        // Fix: BYTE (raw 8-bit) now dispatches as uint8_t; it was swapped
        // with BOOL, diverging from the nvidia backend.
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL: // 2
        // Fix: BOOL now dispatches as bool (was uint8_t).
        return _device_info->calculate<256, cuda::ZerosOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8: // 3
        return _device_info->calculate<256, cuda::ZerosOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16: // 4
        return _device_info->calculate<256, cuda::ZerosOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32: // 5
        return _device_info->calculate<256, cuda::ZerosOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64: // 6
        return _device_info->calculate<256, cuda::ZerosOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8: // 7
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16: // 8
        return _device_info->calculate<256, cuda::ZerosOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32: // 9
        return _device_info->calculate<256, cuda::ZerosOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64: // 10
        return _device_info->calculate<256, cuda::ZerosOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8: // 11
        return _device_info->calculate<256, cuda::ZerosOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16: // 12
        return _device_info->calculate<256, cuda::ZerosOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32: // 13
        return _device_info->calculate<256, cuda::ZerosOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64: // 14
        return _device_info->calculate<256, cuda::ZerosOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16: // 15
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C32: // 16
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C64: // 17
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C128: // 18
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16: // 19
        return _device_info->calculate<256, cuda::ZerosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every case above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::zeros::metax
#ifndef __ZEROS_MOORE_API_H__
#define __ZEROS_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
ELEMENTWISE_DESCRIPTOR(zeros, moore)
#endif // __ZEROS_MOORE_API_H__
#include "zeros_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "../cuda/kernel.cuh"
namespace op::zeros::moore {

Descriptor::~Descriptor() = default;

// Validates descriptors and builds the MOORE elementwise descriptor for
// Zeros. Exactly one input (x) is expected; its shape must match the
// output's, and its values are ignored at compute time.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,  // 1
                INFINI_DTYPE_BOOL,  // 2
                INFINI_DTYPE_I8,    // 3
                INFINI_DTYPE_I16,   // 4
                INFINI_DTYPE_I32,   // 5
                INFINI_DTYPE_I64,   // 6
                INFINI_DTYPE_U8,    // 7
                INFINI_DTYPE_U16,   // 8
                INFINI_DTYPE_U32,   // 9
                INFINI_DTYPE_U64,   // 10
                INFINI_DTYPE_F8,    // 11
                INFINI_DTYPE_F16,   // 12
                INFINI_DTYPE_F32,   // 13
                INFINI_DTYPE_F64,   // 14
                INFINI_DTYPE_BF16); // 19

    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Fills `output` with zeros of the descriptor's dtype by dispatching the
// templated elementwise kernel (block size 256) on _dtype.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // 1
        // Fix: BYTE (raw 8-bit) now dispatches as uint8_t; it was swapped
        // with BOOL, diverging from the nvidia backend.
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL: // 2
        // Fix: BOOL now dispatches as bool (was uint8_t).
        return _device_info->calculate<256, cuda::ZerosOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8: // 3
        return _device_info->calculate<256, cuda::ZerosOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16: // 4
        return _device_info->calculate<256, cuda::ZerosOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32: // 5
        return _device_info->calculate<256, cuda::ZerosOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64: // 6
        return _device_info->calculate<256, cuda::ZerosOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8: // 7
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16: // 8
        return _device_info->calculate<256, cuda::ZerosOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32: // 9
        return _device_info->calculate<256, cuda::ZerosOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64: // 10
        return _device_info->calculate<256, cuda::ZerosOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8: // 11
        return _device_info->calculate<256, cuda::ZerosOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16: // 12
        return _device_info->calculate<256, cuda::ZerosOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32: // 13
        return _device_info->calculate<256, cuda::ZerosOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64: // 14
        return _device_info->calculate<256, cuda::ZerosOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16: // 15
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C32: // 16
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C64: // 17
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C128: // 18
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16: // 19
        return _device_info->calculate<256, cuda::ZerosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every case above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::zeros::moore
#ifndef __ZEROS_MOORE_KERNEL_H__
#define __ZEROS_MOORE_KERNEL_H__
#include <cuda_fp8.h>
namespace op::zeros::cuda {

// Device-side elementwise functor for Zeros: ignores the input element and
// yields the zero of type T.
//
// Fix: the original if-constexpr chain listed std::is_same_v<T, uint8_t>
// twice (branches "// 2" and "// 7"), making the second branch dead code.
// The chain is collapsed: only types needing explicit conversion
// intrinsics/constructors are special-cased; every other supported type
// (bool, all integer widths, float, double) is value-initialized, which is
// exactly the zero the per-type branches produced.
//
// NOTE(review): this header defines the same op::zeros::cuda::ZerosOp as
// ../cuda/kernel.cuh (which zeros_moore.mu also includes) — including both
// in one translation unit would violate the ODR; confirm only one is used.
struct ZerosOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T & /*x*/) const {
        if constexpr (std::is_same_v<T, half>) { // F16
            return __float2half(0.0f);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { // BF16
            return __float2bfloat16(0.0f);
        } else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) { // F8 E4M3
            return cuda_fp8_e4m3(0.0f);
        } else {
            return T{};
        }
    }
};

} // namespace op::zeros::cuda
#endif // __ZEROS_MOORE_KERNEL_H__
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "zeros_nvidia.cuh"
namespace op::zeros::nvidia {

Descriptor::~Descriptor() = default;

// Validates descriptors and builds the CUDA elementwise descriptor for
// Zeros. Exactly one input (x) is expected; its shape must match the
// output's, and its values are ignored at compute time.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE, // 1
                INFINI_DTYPE_BOOL, // 2
                INFINI_DTYPE_I8,   // 3
                INFINI_DTYPE_I16,  // 4
                INFINI_DTYPE_I32,  // 5
                INFINI_DTYPE_I64,  // 6
                INFINI_DTYPE_U8,   // 7
                INFINI_DTYPE_U16,  // 8
                INFINI_DTYPE_U32,  // 9
                INFINI_DTYPE_U64,  // 10
                INFINI_DTYPE_F8,   // 11
                INFINI_DTYPE_F16,  // 12
                INFINI_DTYPE_F32,  // 13
                INFINI_DTYPE_F64,  // 14
                INFINI_DTYPE_BF16, // 19
    );

    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Fills `output` with zeros of the descriptor's dtype by dispatching the
// templated elementwise kernel (block size 256) on _dtype.
// This dtype->C++-type mapping (BYTE->uint8_t, BOOL->bool) is the reference
// the other GPU backends should agree with.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // 1
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL: // 2
        return _device_info->calculate<256, cuda::ZerosOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8: // 3
        return _device_info->calculate<256, cuda::ZerosOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16: // 4
        return _device_info->calculate<256, cuda::ZerosOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32: // 5
        return _device_info->calculate<256, cuda::ZerosOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64: // 6
        return _device_info->calculate<256, cuda::ZerosOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8: // 7
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16: // 8
        return _device_info->calculate<256, cuda::ZerosOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32: // 9
        return _device_info->calculate<256, cuda::ZerosOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64: // 10
        return _device_info->calculate<256, cuda::ZerosOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8: // 11
        return _device_info->calculate<256, cuda::ZerosOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16: // 12
        return _device_info->calculate<256, cuda::ZerosOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32: // 13
        return _device_info->calculate<256, cuda::ZerosOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64: // 14
        return _device_info->calculate<256, cuda::ZerosOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16: // 15
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C32: // 16
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C64: // 17
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C128: // 18
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16: // 19
        return _device_info->calculate<256, cuda::ZerosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every case above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::zeros::nvidia
#ifndef __ZEROS_CUDA_API_H__
#define __ZEROS_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(zeros, nvidia)
#endif // __ZEROS_CUDA_API_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/zeros.h"
#ifdef ENABLE_CPU_API
#include "cpu/zeros_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/zeros_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/zeros_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/zeros_moore.h"
#endif
// Creates a Zeros descriptor for the device held by `handle`.
// The unary (y, x) signature is forwarded to the backend's generic
// elementwise create(handle, desc, out_desc, {input_descs}).
__C infiniStatus_t infiniopCreateZerosDescriptor(
    infiniopHandle_t handle,
    infiniopZerosDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

#define CREATE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        return op::zeros::NAMESPACE::Descriptor::create(                     \
            handle,                                                          \
            reinterpret_cast<op::zeros::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                          \
            {x_desc})

    // Dispatch on the runtime device type; only compiled-in backends have cases.
    switch (handle->device) {

#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Reports the workspace size (in bytes) required by a Zeros descriptor.
__C infiniStatus_t infiniopGetZerosWorkspaceSize(infiniopZerosDescriptor_t desc, size_t *size) {

#define GET(CASE, NAMESPACE)                                                              \
    case CASE:                                                                            \
        *size = reinterpret_cast<op::zeros::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    // Unreachable: every switch path returns above. Kept (apparently) to
    // satisfy compilers that warn about control reaching the end of the
    // function; the other dispatchers in this file omit it.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Executes the Zeros operator: fills `y` with zeros, using `x` only for
// shape/layout (the elementwise framework receives it as the single input).
__C infiniStatus_t infiniopZeros(
    infiniopZerosDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                           \
    case CASE:                                                               \
        return reinterpret_cast<const op::zeros::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x}, stream)

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// Destroys a Zeros descriptor previously created by
// infiniopCreateZerosDescriptor, deleting the backend-specific object.
__C infiniStatus_t
infiniopDestroyZerosDescriptor(infiniopZerosDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        delete reinterpret_cast<const op::zeros::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
from ast import List
import numpy as np
import gguf
from typing import List
from numpy.lib.stride_tricks import as_strided
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor
def ones(x: np.ndarray):
    """Reference implementation: an all-ones array with x's shape and dtype."""
    return np.full_like(x, 1)
class OnesTestCase(InfiniopTestCase):
    """GGUF test-case writer for the `ones` operator.

    Serializes the input x, the (placeholder) output y, their shapes/strides,
    and a float64 reference answer under this op's GGUF key prefix.
    """

    def __init__(self,
                 x: np.ndarray,
                 shape_x: List[int] | None,
                 stride_x: List[int] | None,
                 y: np.ndarray,
                 shape_y: List[int] | None,
                 stride_y: List[int] | None
                 ):
        super().__init__("ones")
        self.x = x
        self.shape_x = shape_x
        self.stride_x = stride_x
        self.y = y
        self.shape_y = shape_y
        self.stride_y = stride_y

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Writes this case's tensors and metadata into the GGUF writer."""
        super().write_test(test_writer)
        # Shapes/strides are optional metadata; y's strides fall back to
        # contiguous strides derived from its shape.
        if self.shape_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
        if self.shape_y is not None:
            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
        if self.stride_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
        test_writer.add_array(
            test_writer.gguf_key("y.strides"),
            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
        )
        test_writer.add_tensor(
            test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype)
        )
        # Reference answer is always computed and stored in float64,
        # regardless of the test dtype.
        ans = ones(
            self.x.astype(np.float64),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == "__main__":
    # Generates ones.gguf containing every (dtype x shape/stride) combination.
    test_writer = InfiniopTestWriter("ones.gguf")
    test_cases = []
    # ==============================================================================
    #  Configuration (Internal Use Only)
    # ==============================================================================
    # These are not meant to be imported from other modules
    _TEST_CASES_ = [
        # shape, x_stride, y_stride
        ((13, 4), None, None),
        ((13, 4), (10, 1), (10, 1)),
        ((13, 4, 4), None, None),
        ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
        ((16, 5632), None, None),
        ((16, 5632), (13312, 1), (13312, 1)),
        ((4, 4, 5632), None, None),
        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
    ]
    _TENSOR_DTYPES_ = [np.bool_,  # 2
                       np.int8,  # 3
                       np.int16,  # 4
                       np.int32,  # 5
                       np.int64,  # 6
                       # np.uint8,  # 7
                       # np.uint16,  # 8
                       # np.uint32,  # 9
                       # np.uint64,  # 10
                       # InfiniDtype.F8,  # 11
                       np.float16,  # 12
                       np.float32,  # 13
                       np.float64,  # 14
                       # InfiniDtype.BF16,  # 19
                       ]
    for dtype in _TENSOR_DTYPES_:
        for shape, stride_x, stride_y in _TEST_CASES_:
            x = np.random.rand(*shape).astype(dtype)
            # NOTE(review): y is serialized as a zero-sized placeholder (its
            # shape is all zeros) while the "y.shape" metadata records the real
            # shape — presumably the test runner allocates y itself; confirm
            # this matches the other operator test writers.
            y = np.empty(tuple(0 for _ in shape), dtype=dtype)
            x = process_zero_stride_tensor(x, stride_x)
            test_case = OnesTestCase(
                x=x,
                shape_x=shape,
                stride_x=stride_x,
                y=y,
                shape_y=shape,
                stride_y=stride_y,
            )
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
from ast import List
import numpy as np
import gguf
from typing import List
from numpy.lib.stride_tricks import as_strided
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor
def zeros(x: np.ndarray):
    """Reference implementation: an all-zeros array with x's shape and dtype."""
    return np.full_like(x, 0)
class ZerosTestCase(InfiniopTestCase):
    """GGUF test-case writer for the `zeros` operator.

    Serializes the input x, the (placeholder) output y, their shapes/strides,
    and a float64 reference answer under this op's GGUF key prefix.
    """

    def __init__(self,
                 x: np.ndarray,
                 shape_x: List[int] | None,
                 stride_x: List[int] | None,
                 y: np.ndarray,
                 shape_y: List[int] | None,
                 stride_y: List[int] | None
                 ):
        super().__init__("zeros")
        self.x = x
        self.shape_x = shape_x
        self.stride_x = stride_x
        self.y = y
        self.shape_y = shape_y
        self.stride_y = stride_y

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Writes this case's tensors and metadata into the GGUF writer."""
        super().write_test(test_writer)
        # Shapes/strides are optional metadata; y's strides fall back to
        # contiguous strides derived from its shape.
        if self.shape_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
        if self.shape_y is not None:
            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
        if self.stride_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
        test_writer.add_array(
            test_writer.gguf_key("y.strides"),
            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
        )
        # print(test_writer)
        test_writer.add_tensor(
            test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype)
        )
        # Reference answer is always computed and stored in float64,
        # regardless of the test dtype.
        ans = zeros(
            self.x.astype(np.float64),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == "__main__":
    # Generates zeros.gguf containing every (dtype x shape/stride) combination.
    test_writer = InfiniopTestWriter("zeros.gguf")
    test_cases = []
    # ==============================================================================
    #  Configuration (Internal Use Only)
    # ==============================================================================
    # These are not meant to be imported from other modules
    _TEST_CASES_ = [
        # shape, x_stride, y_stride
        ((13, 4), None, None),
        ((13, 4), (10, 1), (10, 1)),
        ((13, 4, 4), None, None),
        ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
        ((16, 5632), None, None),
        ((16, 5632), (13312, 1), (13312, 1)),
        ((4, 4, 5632), None, None),
        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
    ]
    _TENSOR_DTYPES_ = [np.bool_,  # 2
                       np.int8,  # 3
                       np.int16,  # 4
                       np.int32,  # 5
                       np.int64,  # 6
                       # np.uint8,  # 7
                       # np.uint16,  # 8
                       # np.uint32,  # 9
                       # np.uint64,  # 10
                       # InfiniDtype.F8,  # 11
                       np.float16,  # 12
                       np.float32,  # 13
                       np.float64,  # 14
                       # InfiniDtype.BF16,  # 19
                       ]
    for dtype in _TENSOR_DTYPES_:
        for shape, stride_x, stride_y in _TEST_CASES_:
            x = np.random.rand(*shape).astype(dtype)
            # NOTE(review): y is serialized as a zero-sized placeholder (its
            # shape is all zeros) while the "y.shape" metadata records the real
            # shape — presumably the test runner allocates y itself; confirm
            # this matches the other operator test writers.
            y = np.empty(tuple(0 for _ in shape), dtype=dtype)
            x = process_zero_stride_tensor(x, stride_x)
            test_case = ZerosTestCase(
                x=x,
                shape_x=shape,
                stride_x=stride_x,
                y=y,
                shape_y=shape,
                stride_y=stride_y,
            )
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
......@@ -673,3 +673,66 @@ def softplus_(lib):
]
lib.infiniopDestroySoftplusDescriptor.restype = c_int32
lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def zeros_(lib):
    # Binds the ctypes signatures for the Zeros C API so Python tests can
    # call into the shared library.
    lib.infiniopCreateZerosDescriptor.restype = c_int32
    lib.infiniopCreateZerosDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
    ]
    lib.infiniopGetZerosWorkspaceSize.restype = c_int32
    lib.infiniopGetZerosWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopZeros.restype = c_int32
    lib.infiniopZeros.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # stream
    ]
    lib.infiniopDestroyZerosDescriptor.restype = c_int32
    lib.infiniopDestroyZerosDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def ones_(lib):
    # Binds the ctypes signatures for the Ones C API so Python tests can
    # call into the shared library.
    lib.infiniopCreateOnesDescriptor.restype = c_int32
    lib.infiniopCreateOnesDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
    ]
    lib.infiniopGetOnesWorkspaceSize.restype = c_int32
    lib.infiniopGetOnesWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopOnes.restype = c_int32
    lib.infiniopOnes.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # stream
    ]
    lib.infiniopDestroyOnesDescriptor.restype = c_int32
    lib.infiniopDestroyOnesDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
......@@ -51,6 +51,8 @@ class TestTensor(CTensor):
scale=None,
bias=None,
set_tensor=None,
randint_low=None,
randint_high=None,
):
self.dt = dt
self.device = device
......@@ -80,7 +82,11 @@ class TestTensor(CTensor):
torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
)
elif mode == "randint":
self._torch_tensor = torch.randint(-2000000000,2000000000, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device])
randint_low = -2000000000 if randint_low is None else randint_low
randint_high = 2000000000 if randint_high is None else randint_high
self._torch_tensor = torch.randint(randint_low,randint_high, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device])
elif mode == "float8_e4m3fn":
self._torch_tensor = torch.rand(shape, dtype=torch.float32, device=torch_device_map[device]).to(dtype=torch.float8_e4m3fn)
elif mode == "manual":
assert set_tensor is not None
assert torch_shape == list(set_tensor.shape)
......@@ -139,7 +145,11 @@ class TestTensor(CTensor):
def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
if dt == InfiniDtype.I8:
if dt == InfiniDtype.BOOL:
return torch.bool
elif dt == InfiniDtype.BYTE:
return torch.uint8
elif dt == InfiniDtype.I8:
return torch.int8
elif dt == InfiniDtype.I16:
return torch.int16
......@@ -165,6 +175,8 @@ def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
return torch.int32 if compatability_mode else torch.uint32
elif dt == InfiniDtype.U64:
return torch.int64 if compatability_mode else torch.uint64
elif dt == InfiniDtype.F8:
return torch.float8_e4m3fn
else:
raise ValueError("Unsupported data type")
......@@ -269,7 +281,21 @@ def rearrange_tensor(tensor, new_strides):
new_positions += offset
# Copy the original data to the new tensor
new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
if tensor.dtype in [torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32,torch.int64, torch.float16,torch.bfloat16,torch.float32,torch.float64]:
new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
elif tensor.dtype in [torch.uint16, torch.uint32, torch.uint64]:
new_tensor_int64 = new_tensor.to(dtype=torch.int64)
tensor_int64 = tensor.to(dtype=torch.int64)
new_tensor_int64.view(-1).index_add_(0, new_positions, tensor_int64.view(-1))
new_tensor = new_tensor_int64.to(dtype=tensor.dtype)
elif tensor.dtype in [torch.float8_e4m3fn]:
new_tensor_float64 = new_tensor.to(dtype=torch.float64)
tensor_float64 = tensor.to(dtype=torch.float64)
new_tensor_float64.view(-1).index_add_(0, new_positions, tensor_float64.view(-1))
new_tensor = new_tensor_float64.to(dtype=tensor.dtype)
else:
raise ValueError("Unsupported data type")
new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides))
return new_tensor
......@@ -484,11 +510,12 @@ def print_discrepancy(
nan_mismatch = (
actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan
)
diff_mask = nan_mismatch | (
torch.abs(actual - expected) > (atol + rtol * torch.abs(expected))
torch.abs(actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64)) > (atol + rtol * torch.abs(expected.to(dtype=torch.float64)))
)
diff_indices = torch.nonzero(diff_mask, as_tuple=False)
delta = actual - expected
delta = actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64)
# Display format: widths for columns
col_width = [18, 20, 20, 20]
......
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules

# Each entry: (shape, x_stride, y_stride); None means a contiguous layout.
_TEST_CASES_ = [
    ((13, 4), None, None),
    ((13, 4), (10, 1), (10, 1)),
    ((13, 4), (0, 1), (0, 1)),
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
    ((16, 5632), None, None),
    ((16, 5632), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]


class Inplace(Enum):
    # Output written to a separate tensor.
    OUT_OF_PLACE = auto()
    # Input tensor reused as the output.
    INPLACE_X = auto()


# Every base case is exercised once per in-place option.
_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_X]

# Cartesian product: append each in-place option to each base case tuple.
_TEST_CASES = [case + (option,) for case in _TEST_CASES_ for option in _INPLACE]

# Data types used for testing (commented entries are not enabled yet).
_TENSOR_DTYPES = [
    InfiniDtype.BYTE,  # 1
    InfiniDtype.BOOL,  # 2
    InfiniDtype.I8,  # 3
    InfiniDtype.I16,  # 4
    InfiniDtype.I32,  # 5
    InfiniDtype.I64,  # 6
    InfiniDtype.U8,  # 7
    # InfiniDtype.U16,  # 8
    # InfiniDtype.U32,  # 9
    # InfiniDtype.U64,  # 10
    # InfiniDtype.F8,  # 11
    InfiniDtype.F16,  # 12
    InfiniDtype.F32,  # 13
    InfiniDtype.F64,  # 14
    InfiniDtype.BF16,  # 19
]

# All dtypes share one tolerance for this operator (exact fill of ones).
_TOLERANCE_MAP = {
    dt: {"atol": 1e-3, "rtol": 1e-3}
    for dt in (
        InfiniDtype.BYTE,
        InfiniDtype.BOOL,
        InfiniDtype.I8,
        InfiniDtype.I16,
        InfiniDtype.I32,
        InfiniDtype.I64,
        InfiniDtype.U8,
        InfiniDtype.U16,
        InfiniDtype.U32,
        InfiniDtype.U64,
        InfiniDtype.F8,
        InfiniDtype.F16,
        InfiniDtype.F32,
        InfiniDtype.F64,
        InfiniDtype.BF16,
    )
}

# Runtime knobs; overwritten from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_ones(y, x):
    """Reference implementation: fill ``y`` with ones in place.

    ``x`` is accepted only to mirror the library operator's signature; it is
    unused by the reference computation.
    """
    # fill_ writes in place and avoids allocating the temporary tensor that
    # ones_like() + copy_() would create.
    y.fill_(1)
def _make_random_input(shape, stride, dtype, device):
    """Build a TestTensor whose initial contents are random and valid for ``dtype``."""
    if dtype in [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]:
        # Default TestTensor mode fills floating-point tensors with random values.
        return TestTensor(shape, stride, dtype, device)
    elif dtype in [InfiniDtype.BYTE, InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64,
                   InfiniDtype.I8, InfiniDtype.I16, InfiniDtype.I32, InfiniDtype.I64]:
        # Small non-negative integers are representable in every integer dtype here.
        return TestTensor(shape, stride, dtype, device, mode="randint", randint_low=0, randint_high=16)
    elif dtype in [InfiniDtype.F8]:
        return TestTensor(shape, stride, dtype, device, mode="float8_e4m3fn")
    elif dtype in [InfiniDtype.BOOL]:
        return TestTensor(shape, stride, dtype, device, mode="randint", randint_low=0, randint_high=2)
    else:
        raise ValueError("Unsupported dtype")


def test(
    handle,
    device,
    shape,
    x_stride=None,
    y_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=None,
    sync=None,
):
    """Validate infiniopOnes against the torch reference for one
    (shape, strides, dtype, inplace) combination."""
    x = _make_random_input(shape, x_stride, dtype, device)
    if inplace == Inplace.INPLACE_X:
        # In-place execution only makes sense when input/output layouts agree.
        if x_stride != y_stride:
            return
        y = x
    else:
        # BUGFIX: the output used to be created with mode="ones" -- i.e.
        # pre-filled with exactly the value this operator must produce, so a
        # no-op kernel would have passed the comparison below. Start from
        # random contents instead so the kernel's writes are actually checked.
        y = _make_random_input(shape, y_stride, dtype, device)
    if y.is_broadcast():
        return
    print(
        f"Testing Ones on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    # Compute the reference result on the torch-side copy of y.
    torch_ones(y.torch_tensor(), x.torch_tensor())
    if sync is not None:
        sync()
    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateOnesDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [y, x]:
        tensor.destroy_desc()
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetOnesWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, y.device)

    def lib_ones():
        # Invoke the library kernel; last argument is the (null) stream.
        check_error(
            LIBINFINIOP.infiniopOnes(
                descriptor,
                workspace.data(),
                workspace.size(),
                y.data(),
                x.data(),
                None,
            )
        )

    lib_ones()
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
    # Compare in float32 so integer and low-precision dtypes share one path.
    assert torch.allclose(y.actual_tensor().to(dtype=torch.float32), y.torch_tensor().to(dtype=torch.float32), atol=atol, rtol=rtol)
    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: torch_ones(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_ones(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyOnesDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    # Propagate CLI flags into the module-level test configuration.
    DEBUG, PROFILE = args.debug, args.profile
    NUM_PRERUN, NUM_ITERATIONS = args.num_prerun, args.num_iterations
    # Run every configured case on every requested device.
    for dev in get_test_devices(args):
        test_operator(dev, test, _TEST_CASES, _TENSOR_DTYPES)
    print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules

# Each entry: (shape, x_stride, y_stride); None means a contiguous layout.
_TEST_CASES_ = [
    ((13, 4), None, None),
    ((13, 4), (10, 1), (10, 1)),
    ((13, 4), (0, 1), (0, 1)),
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
    ((16, 5632), None, None),
    ((16, 5632), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]


class Inplace(Enum):
    # Output written to a separate tensor.
    OUT_OF_PLACE = auto()
    # Input tensor reused as the output.
    INPLACE_X = auto()


# Every base case is exercised once per in-place option.
_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_X]

# Cartesian product: append each in-place option to each base case tuple.
_TEST_CASES = [case + (option,) for case in _TEST_CASES_ for option in _INPLACE]

# Data types used for testing (commented entries are not enabled yet).
_TENSOR_DTYPES = [
    InfiniDtype.BYTE,  # 1
    InfiniDtype.BOOL,  # 2
    InfiniDtype.I8,  # 3
    InfiniDtype.I16,  # 4
    InfiniDtype.I32,  # 5
    InfiniDtype.I64,  # 6
    InfiniDtype.U8,  # 7
    # InfiniDtype.U16,  # 8
    # InfiniDtype.U32,  # 9
    # InfiniDtype.U64,  # 10
    # InfiniDtype.F8,  # 11
    InfiniDtype.F16,  # 12
    InfiniDtype.F32,  # 13
    InfiniDtype.F64,  # 14
    InfiniDtype.BF16,  # 19
]

# All dtypes share one tolerance for this operator (exact fill of zeros).
_TOLERANCE_MAP = {
    dt: {"atol": 1e-3, "rtol": 1e-3}
    for dt in (
        InfiniDtype.BYTE,
        InfiniDtype.BOOL,
        InfiniDtype.I8,
        InfiniDtype.I16,
        InfiniDtype.I32,
        InfiniDtype.I64,
        InfiniDtype.U8,
        InfiniDtype.U16,
        InfiniDtype.U32,
        InfiniDtype.U64,
        InfiniDtype.F8,
        InfiniDtype.F16,
        InfiniDtype.F32,
        InfiniDtype.F64,
        InfiniDtype.BF16,
    )
}

# Runtime knobs; overwritten from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_zeros(y, x):
    """Reference implementation: fill ``y`` with zeros in place.

    ``x`` is accepted only to mirror the library operator's signature; it is
    unused by the reference computation.
    """
    # zero_ writes in place and avoids allocating the temporary tensor that
    # zeros_like() + copy_() would create.
    y.zero_()
def test(
    handle,
    device,
    shape,
    x_stride=None,
    y_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=None,
    sync=None,
):
    """Validate infiniopZeros against the torch reference for one
    (shape, strides, dtype, inplace) combination.

    The input tensor is filled with dtype-appropriate random data; the output
    starts as ones so the operator's zero-fill is observable. Returns early
    (skips the case) for incompatible in-place layouts or broadcast outputs.
    """
    # Pick an input-generation mode that is valid for the dtype under test.
    if dtype in [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]:
        x = TestTensor(shape, x_stride, dtype, device)
    elif dtype in [InfiniDtype.BYTE, InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64,
                   InfiniDtype.I8, InfiniDtype.I16, InfiniDtype.I32, InfiniDtype.I64]:
        # Small non-negative integers are representable in every integer dtype here.
        x = TestTensor(shape, x_stride, dtype, device, mode="randint", randint_low=0, randint_high=16)
    elif dtype in [InfiniDtype.F8]:
        x = TestTensor(shape, x_stride, dtype, device, mode="float8_e4m3fn")
    elif dtype in [InfiniDtype.BOOL]:
        # Booleans: randint over {0, 1}.
        x = TestTensor(shape, x_stride, dtype, device, mode="randint", randint_low=0, randint_high=2)
    else:
        raise ValueError("Unsupported dtype")
    if inplace == Inplace.INPLACE_X:
        # In-place execution only makes sense when input/output layouts agree.
        if x_stride != y_stride:
            return
        y = x
    else:
        # Output starts as ones, so a correct zero-fill visibly changes it.
        y = TestTensor(shape, y_stride, dtype, device, mode="ones")
    if y.is_broadcast():
        return
    print(
        f"Testing Zeros on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    # Compute the reference result on the torch-side copy of y.
    torch_zeros(y.torch_tensor(), x.torch_tensor())
    if sync is not None:
        sync()
    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateZerosDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [y, x]:
        tensor.destroy_desc()
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetZerosWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, y.device)

    def lib_zeros():
        # Invoke the library kernel; last argument is the (null) stream.
        check_error(
            LIBINFINIOP.infiniopZeros(
                descriptor,
                workspace.data(),
                workspace.size(),
                y.data(),
                x.data(),
                None,
            )
        )

    lib_zeros()
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
    # Compare in float32 so integer and low-precision dtypes share one path.
    assert torch.allclose(y.actual_tensor().to(dtype=torch.float32), y.torch_tensor().to(dtype=torch.float32), atol=atol, rtol=rtol)
    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: torch_zeros(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_zeros(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyZerosDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    # Propagate CLI flags into the module-level test configuration.
    DEBUG, PROFILE = args.debug, args.profile
    NUM_PRERUN, NUM_ITERATIONS = args.num_prerun, args.num_iterations
    # Run every configured case on every requested device.
    for dev in get_test_devices(args):
        test_operator(dev, test, _TEST_CASES, _TENSOR_DTYPES)
    print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment