Unverified Commit 9b8de584 authored by pengcheng888's avatar pengcheng888 Committed by GitHub
Browse files

issue/473 - the ones and zeros operators


Co-authored-by: pengcheng888 <pengcheng@example.com>
parent f5e6d729
......@@ -20,6 +20,8 @@
#include "infiniop/ops/sub.h"
#include "infiniop/ops/swiglu.h"
#include "infiniop/ops/topkrouter.h"
#include "infiniop/ops/zeros.h"
#include "infiniop/ops/ones.h"
#include "infiniop/ops/topksoftmax.h"
#include "infiniop/ops/sigmoid.h"
#include "infiniop/tensor_descriptor.h"
......
#ifndef __INFINIOP_ONES_API_H__
#define __INFINIOP_ONES_API_H__
#include "../operator_descriptor.h"
// Opaque descriptor handle for the `ones` operator (the visible CPU/CUDA
// kernels fill each output element with the constant 1; x supplies shape/dtype).
typedef struct InfiniopDescriptor *infiniopOnesDescriptor_t;
// Creates a `ones` descriptor. y is the output tensor descriptor; x is the
// input tensor descriptor — implementations require x and y to share a shape.
__C __export infiniStatus_t infiniopCreateOnesDescriptor(infiniopHandle_t handle,
infiniopOnesDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);
// Queries the scratch-space size (in bytes) required by infiniopOnes.
__C __export infiniStatus_t infiniopGetOnesWorkspaceSize(infiniopOnesDescriptor_t desc, size_t *size);
// Executes the operator: writes ones into y. workspace must be at least the
// size reported above; stream may be null for the default stream.
__C __export infiniStatus_t infiniopOnes(infiniopOnesDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);
// Releases the descriptor created by infiniopCreateOnesDescriptor.
__C __export infiniStatus_t infiniopDestroyOnesDescriptor(infiniopOnesDescriptor_t desc);
#endif
#ifndef __INFINIOP_ZEROS_API_H__
#define __INFINIOP_ZEROS_API_H__
#include "../operator_descriptor.h"
// Opaque descriptor handle for the `zeros` operator. Presumably fills y with
// zeros (mirrors the `ones` API; the zeros kernel itself is not shown here —
// TODO confirm against the kernel source).
typedef struct InfiniopDescriptor *infiniopZerosDescriptor_t;
// Creates a `zeros` descriptor. y is the output tensor descriptor; x is the
// input tensor descriptor used as the shape/dtype source.
__C __export infiniStatus_t infiniopCreateZerosDescriptor(infiniopHandle_t handle,
infiniopZerosDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);
// Queries the scratch-space size (in bytes) required by infiniopZeros.
__C __export infiniStatus_t infiniopGetZerosWorkspaceSize(infiniopZerosDescriptor_t desc, size_t *size);
// Executes the operator; stream may be null for the default stream.
__C __export infiniStatus_t infiniopZeros(infiniopZerosDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);
// Releases the descriptor created by infiniopCreateZerosDescriptor.
__C __export infiniStatus_t infiniopDestroyZerosDescriptor(infiniopZerosDescriptor_t desc);
#endif
......@@ -26,6 +26,8 @@ def run_tests(args):
"sub.py",
"swiglu.py",
"softplus.py",
"ones.py",
"zeros.py",
"sigmoid.py",
"topkrouter.py",
"topksoftmax.py",
......
......@@ -16,6 +16,9 @@ DECLARE_INFINIOP_TEST(add)
DECLARE_INFINIOP_TEST(causal_softmax)
DECLARE_INFINIOP_TEST(rearrange)
DECLARE_INFINIOP_TEST(sub)
DECLARE_INFINIOP_TEST(zeros)
DECLARE_INFINIOP_TEST(ones)
DECLARE_INFINIOP_TEST(sigmoid)
DECLARE_INFINIOP_TEST(topkrouter)
DECLARE_INFINIOP_TEST(topksoftmax)
......@@ -45,6 +48,8 @@ DECLARE_INFINIOP_TEST(topksoftmax)
REGISTER_INFINIOP_TEST(causal_softmax) \
REGISTER_INFINIOP_TEST(rearrange) \
REGISTER_INFINIOP_TEST(sub) \
REGISTER_INFINIOP_TEST(zeros) \
REGISTER_INFINIOP_TEST(ones) \
REGISTER_INFINIOP_TEST(sigmoid) \
REGISTER_INFINIOP_TEST(topkrouter) \
REGISTER_INFINIOP_TEST(topksoftmax) \
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::ones {
// Tensors for the `ones` operator test:
//   x   - input tensor (shape/dtype source for the op)
//   y   - output tensor written by the op
//   ans - expected result compared via allClose
struct Test::Attributes {
    std::shared_ptr<Tensor> x;
    std::shared_ptr<Tensor> y;
    std::shared_ptr<Tensor> ans;
};
// Builds a test case from the deserialized tensor map; throws if any of the
// three mandatory tensors is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    if (tensors.find("x") == tensors.end()
        || tensors.find("y") == tensors.end()
        || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid Test");
    }
    test->_attributes->x = tensors["x"];
    test->_attributes->y = tensors["y"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
// Runs the op once for correctness, then benchmarks it. Device resources
// (workspace, descriptor) are released on every exit path after creation —
// the original leaked both the workspace and the descriptor.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopOnesDescriptor_t op_desc;
    auto x = _attributes->x->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    CHECK_OR(infiniopCreateOnesDescriptor(handle, &op_desc,
                                          y->desc(),
                                          x->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetOnesWorkspaceSize(op_desc, &workspace_size),
             {
                 infiniopDestroyOnesDescriptor(op_desc);
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.");
             });
    void *workspace;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             {
                 infiniopDestroyOnesDescriptor(op_desc);
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.");
             });
    // Single cleanup routine for all paths past this point.
    auto release = [&]() {
        infinirtFree(workspace);
        infiniopDestroyOnesDescriptor(op_desc);
    };
    CHECK_OR(infiniopOnes(op_desc, workspace, workspace_size,
                          y->data(),
                          x->data(),
                          nullptr),
             {
                 release();
                 return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.");
             });
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        release();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    double elapsed_time = 0.;
    elapsed_time = benchmark(
        [=]() {
            infiniopOnes(
                op_desc, workspace, workspace_size,
                y->data(),
                x->data(),
                nullptr);
        },
        warm_ups, iterations);
    release();
    return TEST_PASSED(elapsed_time);
}
// This test has no scalar attributes.
std::vector<std::string> Test::attribute_names() {
    return {};
}
std::vector<std::string> Test::tensor_names() {
    return {"x", "y", "ans"};
}
std::vector<std::string> Test::output_names() {
    return {"y"};
}
// Human-readable summary used in test reports.
std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;
    oss << "- x: " << _attributes->x->info() << std::endl;
    oss << "- y: " << _attributes->y->info() << std::endl;
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return oss.str();
}
Test::~Test() {
    delete _attributes;
}
} // namespace infiniop_test::ones
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::zeros {
// Tensors for the `zeros` operator test:
//   x   - input tensor (shape/dtype source for the op)
//   y   - output tensor written by the op
//   ans - expected result compared via allClose
struct Test::Attributes {
    std::shared_ptr<Tensor> x;
    std::shared_ptr<Tensor> y;
    std::shared_ptr<Tensor> ans;
};
// Builds a test case from the deserialized tensor map; throws if any of the
// three mandatory tensors is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    if (tensors.find("x") == tensors.end()
        || tensors.find("y") == tensors.end()
        || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid Test");
    }
    test->_attributes->x = tensors["x"];
    test->_attributes->y = tensors["y"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
// Runs the op once for correctness, then benchmarks it. Device resources
// (workspace, descriptor) are released on every exit path after creation —
// the original leaked both the workspace and the descriptor.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopZerosDescriptor_t op_desc;
    auto x = _attributes->x->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    CHECK_OR(infiniopCreateZerosDescriptor(handle, &op_desc,
                                           y->desc(),
                                           x->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetZerosWorkspaceSize(op_desc, &workspace_size),
             {
                 infiniopDestroyZerosDescriptor(op_desc);
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.");
             });
    void *workspace;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             {
                 infiniopDestroyZerosDescriptor(op_desc);
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.");
             });
    // Single cleanup routine for all paths past this point.
    auto release = [&]() {
        infinirtFree(workspace);
        infiniopDestroyZerosDescriptor(op_desc);
    };
    CHECK_OR(infiniopZeros(op_desc, workspace, workspace_size,
                           y->data(),
                           x->data(),
                           nullptr),
             {
                 release();
                 return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.");
             });
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        release();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    double elapsed_time = 0.;
    elapsed_time = benchmark(
        [=]() {
            infiniopZeros(
                op_desc, workspace, workspace_size,
                y->data(),
                x->data(),
                nullptr);
        },
        warm_ups, iterations);
    release();
    return TEST_PASSED(elapsed_time);
}
// This test has no scalar attributes.
std::vector<std::string> Test::attribute_names() {
    return {};
}
std::vector<std::string> Test::tensor_names() {
    return {"x", "y", "ans"};
}
std::vector<std::string> Test::output_names() {
    return {"y"};
}
// Human-readable summary used in test reports.
std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;
    oss << "- x: " << _attributes->x->info() << std::endl;
    oss << "- y: " << _attributes->y->info() << std::endl;
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return oss.str();
}
Test::~Test() {
    delete _attributes;
}
} // namespace infiniop_test::zeros
#define INFINIOP_METAX_KERNEL __global__ void
#include <hpcc_fp8.h>
// Possible maximum number of threads per block for METAX architectures
// Used for picking correct kernel launch configuration
#define METAX_BLOCK_SIZE_1024 1024
......@@ -9,6 +11,7 @@
using cuda_bfloat16 = hpcc_bfloat16;
using cuda_bfloat162 = hpcc_bfloat162;
using cuda_fp8_e4m3 = __hpcc_fp8_e4m3;
namespace device::metax {
......
......@@ -2,6 +2,7 @@
#include <musa_bf16.h>
#include <musa_fp16.h>
#include <musa_fp8.h>
// Possible maximum number of threads per block for MUSA architectures
// Used for picking correct kernel launch configuration
......@@ -13,6 +14,7 @@
using cuda_bfloat16 = mt_bfloat16;
using cuda_bfloat162 = mt_bfloat162;
using cuda_fp8_e4m3 = __mt_fp8_e4m3;
namespace device::moore {
......
......@@ -9,6 +9,7 @@
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_fp8.h>
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
......@@ -25,6 +26,7 @@ using cuda_bfloat162 = __nv_bfloat162;
#else
using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162;
using cuda_fp8_e4m3 = __nv_fp8_e4m3;
#endif
namespace device::nvidia {
......
#include "ones_cpu.h"
namespace op::ones::cpu {
Descriptor::~Descriptor() = default;
// Creates the CPU descriptor for the `ones` elementwise operator.
// out_desc describes the output y; input_desc_vec[0] is the input x, whose
// shape must match y's.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();
    // All supported (non-complex) dtypes. Trailing comma removed: it passed an
    // empty trailing macro argument to CHECK_DTYPE.
    // NOTE(review): F8 is accepted here but calculate() below returns
    // NOT_IMPLEMENTED for it — confirm intent.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,
                INFINI_DTYPE_BOOL,
                INFINI_DTYPE_I8,
                INFINI_DTYPE_I16,
                INFINI_DTYPE_I32,
                INFINI_DTYPE_I64,
                INFINI_DTYPE_U8,
                INFINI_DTYPE_U16,
                INFINI_DTYPE_U32,
                INFINI_DTYPE_U64,
                INFINI_DTYPE_F8,
                INFINI_DTYPE_F16,
                INFINI_DTYPE_F32,
                INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);
    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Dispatches on the stored dtype and fills the output with ones via OnesOp.
// Complex dtypes and F8 are not implemented.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // raw 8-bit storage, treated as uint8_t
        return _device_info->calculate<OnesOp, uint8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_BOOL:
        return _device_info->calculate<OnesOp, bool>(_info, output, inputs, stream);
    case INFINI_DTYPE_I8:
        return _device_info->calculate<OnesOp, int8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I16:
        return _device_info->calculate<OnesOp, int16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<OnesOp, int32_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<OnesOp, int64_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U8:
        return _device_info->calculate<OnesOp, uint8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U16:
        return _device_info->calculate<OnesOp, uint16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U32:
        return _device_info->calculate<OnesOp, uint32_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U64:
        return _device_info->calculate<OnesOp, uint64_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F8: // accepted by create() but has no CPU kernel yet
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_F16:
        return _device_info->calculate<OnesOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<OnesOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<OnesOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_C16:
    case INFINI_DTYPE_C32:
    case INFINI_DTYPE_C64:
    case INFINI_DTYPE_C128: // complex types are not supported
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<OnesOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::ones::cpu
#ifndef __ONES_CPU_H__
#define __ONES_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(ones, cpu)
namespace op::ones::cpu {
// Fill functor for the elementwise launcher: ignores its single input
// element and produces the constant one in the element type.
struct OnesOp {
    // The operator formally consumes one input tensor (shape/dtype source).
    static constexpr size_t num_inputs = 1;

    template <typename Elem>
    Elem operator()(const Elem & /*ignored*/) const {
        return static_cast<Elem>(1.0);
    }
};
} // namespace op::ones::cpu
#endif // __ONES_CPU_H__
#ifndef __ONES_CUDA_H__
#define __ONES_CUDA_H__

namespace op::ones::cuda {
// Device-side fill functor: ignores its input element and returns the
// constant one in the element type T. The original chain tested
// std::is_same_v<T, uint8_t> twice (branches //2 and //7), leaving the second
// branch dead; all integral cases are collapsed into one branch here.
typedef struct OnesOp {
public:
    // The operator formally consumes one input tensor (shape/dtype source).
    static constexpr size_t num_inputs = 1;
    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, bool>) {
            return true;
        } else if constexpr (std::is_integral_v<T>) {
            // int8/16/32/64, uint8/16/32/64 (BYTE included)
            return T(1);
        } else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) {
            return cuda_fp8_e4m3(1.0f);
        } else if constexpr (std::is_same_v<T, half>) {
            return __float2half(1.0f);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            return __float2bfloat16(1.0f);
        } else {
            // float, double, and any other arithmetic type
            return static_cast<T>(1.0);
        }
    }
} OnesOp;
} // namespace op::ones::cuda

#endif // __ONES_CUDA_H__
#ifndef __ONES_METAX_API_H__
#define __ONES_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
// Declares op::ones::metax::Descriptor via the shared elementwise template.
ELEMENTWISE_DESCRIPTOR(ones, metax)
#endif // __ONES_METAX_API_H__
#include "ones_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::ones::metax {
Descriptor::~Descriptor() = default;
// Creates the Metax descriptor for the `ones` elementwise operator.
// out_desc describes the output y; input_desc_vec[0] is the input x, whose
// shape must match y's.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();
    // All supported (non-complex) dtypes. Trailing comma removed: it passed an
    // empty trailing macro argument to CHECK_DTYPE.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,
                INFINI_DTYPE_BOOL,
                INFINI_DTYPE_I8,
                INFINI_DTYPE_I16,
                INFINI_DTYPE_I32,
                INFINI_DTYPE_I64,
                INFINI_DTYPE_U8,
                INFINI_DTYPE_U16,
                INFINI_DTYPE_U32,
                INFINI_DTYPE_U64,
                INFINI_DTYPE_F8,
                INFINI_DTYPE_F16,
                INFINI_DTYPE_F32,
                INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);
    // create Metax elementwise descriptor
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Dispatches on the stored dtype and launches the OnesOp kernel.
// Fix: the original mapped BYTE -> bool and BOOL -> uint8_t, the reverse of
// the nvidia implementation; BYTE now maps to uint8_t and BOOL to bool.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // raw 8-bit storage, treated as uint8_t
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL:
        return _device_info->calculate<256, cuda::OnesOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8:
        return _device_info->calculate<256, cuda::OnesOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16:
        return _device_info->calculate<256, cuda::OnesOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<256, cuda::OnesOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<256, cuda::OnesOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8:
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16:
        return _device_info->calculate<256, cuda::OnesOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32:
        return _device_info->calculate<256, cuda::OnesOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64:
        return _device_info->calculate<256, cuda::OnesOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8:
        return _device_info->calculate<256, cuda::OnesOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::OnesOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::OnesOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::OnesOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16:
    case INFINI_DTYPE_C32:
    case INFINI_DTYPE_C64:
    case INFINI_DTYPE_C128: // complex types are not supported
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::OnesOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::ones::metax
#ifndef __ONES_MOORE_API_H__
#define __ONES_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
// Declares op::ones::moore::Descriptor via the shared elementwise template.
ELEMENTWISE_DESCRIPTOR(ones, moore)
#endif // __ONES_MOORE_API_H__
#include "ones_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "../cuda/kernel.cuh"
namespace op::ones::moore {
Descriptor::~Descriptor() = default;
// Creates the Moore descriptor for the `ones` elementwise operator.
// out_desc describes the output y; input_desc_vec[0] is the input x, whose
// shape must match y's.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();
    // All supported (non-complex) dtypes. Trailing comma removed: it passed an
    // empty trailing macro argument to CHECK_DTYPE.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,
                INFINI_DTYPE_BOOL,
                INFINI_DTYPE_I8,
                INFINI_DTYPE_I16,
                INFINI_DTYPE_I32,
                INFINI_DTYPE_I64,
                INFINI_DTYPE_U8,
                INFINI_DTYPE_U16,
                INFINI_DTYPE_U32,
                INFINI_DTYPE_U64,
                INFINI_DTYPE_F8,
                INFINI_DTYPE_F16,
                INFINI_DTYPE_F32,
                INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);
    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Dispatches on the stored dtype and launches the OnesOp kernel.
// Fix: the original mapped BYTE -> bool and BOOL -> uint8_t, the reverse of
// the nvidia implementation; BYTE now maps to uint8_t and BOOL to bool.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // raw 8-bit storage, treated as uint8_t
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL:
        return _device_info->calculate<256, cuda::OnesOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8:
        return _device_info->calculate<256, cuda::OnesOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16:
        return _device_info->calculate<256, cuda::OnesOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<256, cuda::OnesOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<256, cuda::OnesOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8:
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16:
        return _device_info->calculate<256, cuda::OnesOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32:
        return _device_info->calculate<256, cuda::OnesOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64:
        return _device_info->calculate<256, cuda::OnesOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8:
        return _device_info->calculate<256, cuda::OnesOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::OnesOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::OnesOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::OnesOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16:
    case INFINI_DTYPE_C32:
    case INFINI_DTYPE_C64:
    case INFINI_DTYPE_C128: // complex types are not supported
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::OnesOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::ones::moore
#ifndef __ONES_MOORE_KERNEL_H__
#define __ONES_MOORE_KERNEL_H__

// NOTE(review): this Moore kernel header includes <cuda_fp8.h>, while the
// Moore/MUSA common header in this change uses <musa_fp8.h> — confirm the
// intended header for MUSA builds.
#include <cuda_fp8.h>

namespace op::ones::cuda {
// Device-side fill functor: ignores its input element and returns the
// constant one in the element type T. The original chain tested
// std::is_same_v<T, uint8_t> twice (branches //2 and //7), leaving the second
// branch dead; all integral cases are collapsed into one branch here.
typedef struct OnesOp {
public:
    // The operator formally consumes one input tensor (shape/dtype source).
    static constexpr size_t num_inputs = 1;
    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, bool>) {
            return true;
        } else if constexpr (std::is_integral_v<T>) {
            // int8/16/32/64, uint8/16/32/64 (BYTE included)
            return T(1);
        } else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) {
            return cuda_fp8_e4m3(1.0f);
        } else if constexpr (std::is_same_v<T, half>) {
            return __float2half(1.0f);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            return __float2bfloat16(1.0f);
        } else {
            // float, double, and any other arithmetic type
            return static_cast<T>(1.0);
        }
    }
} OnesOp;
} // namespace op::ones::cuda

#endif // __ONES_MOORE_KERNEL_H__
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "ones_nvidia.cuh"
namespace op::ones::nvidia {
Descriptor::~Descriptor() = default;
// Creates the CUDA descriptor for the `ones` elementwise operator.
// out_desc describes the output y; input_desc_vec[0] is the input x, whose
// shape must match y's.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();
    // All supported (non-complex) dtypes. Trailing comma removed: it passed an
    // empty trailing macro argument to CHECK_DTYPE.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,
                INFINI_DTYPE_BOOL,
                INFINI_DTYPE_I8,
                INFINI_DTYPE_I16,
                INFINI_DTYPE_I32,
                INFINI_DTYPE_I64,
                INFINI_DTYPE_U8,
                INFINI_DTYPE_U16,
                INFINI_DTYPE_U32,
                INFINI_DTYPE_U64,
                INFINI_DTYPE_F8,
                INFINI_DTYPE_F16,
                INFINI_DTYPE_F32,
                INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);
    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Dispatches on the stored dtype and launches the OnesOp kernel with a
// 256-thread block configuration.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // raw 8-bit storage, treated as uint8_t
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL:
        return _device_info->calculate<256, cuda::OnesOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8:
        return _device_info->calculate<256, cuda::OnesOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16:
        return _device_info->calculate<256, cuda::OnesOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<256, cuda::OnesOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<256, cuda::OnesOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8:
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16:
        return _device_info->calculate<256, cuda::OnesOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32:
        return _device_info->calculate<256, cuda::OnesOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64:
        return _device_info->calculate<256, cuda::OnesOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8:
        return _device_info->calculate<256, cuda::OnesOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::OnesOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::OnesOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::OnesOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16:
    case INFINI_DTYPE_C32:
    case INFINI_DTYPE_C64:
    case INFINI_DTYPE_C128: // complex types are not supported
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::OnesOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::ones::nvidia
#ifndef __ONES_CUDA_API_H__
#define __ONES_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
// Declares op::ones::nvidia::Descriptor via the shared elementwise template.
ELEMENTWISE_DESCRIPTOR(ones, nvidia)
#endif // __ONES_CUDA_API_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment