Unverified commit 93191613 authored by thatPepe, committed by GitHub

Merge pull request #1075 from InfiniTensor/RevertT_1-1-4

Revert T1-1-4
parents 6ab911c3 def22a08
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/equal.hpp"
namespace py = pybind11;
namespace infinicore::ops {
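// Naming convention used throughout these bindings: `op` allocates and returns
// its result, while `op_` (trailing underscore) writes into a caller-provided
// output tensor.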
inline void bind_equal(py::module &m) {
m.def("equal",
&op::equal,
py::arg("a"),
py::arg("b"),
R"doc(Elementwise equality returning a bool tensor.)doc");
m.def("equal_",
&op::equal_,
py::arg("out"),
py::arg("a"),
py::arg("b"),
R"doc(In-place elementwise equality writing into `out`.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardswish.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardswish(py::module &m) {
m.def("hardswish",
&op::hardswish,
py::arg("input"),
R"doc(Out-of-place Hardswish activation.)doc");
m.def("hardswish_",
&op::hardswish_,
py::arg("output"),
py::arg("input"),
R"doc(In-place Hardswish activation.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardtanh.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardtanh(py::module &m) {
m.def("hardtanh",
&op::hardtanh,
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(Apply the HardTanh activation.)doc");
m.def("hardtanh_",
&op::hardtanh_,
py::arg("output"),
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(In-place HardTanh activation.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/sum.hpp"
namespace py = pybind11;
namespace infinicore::ops {
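// `dim` may arrive from Python as None (reduce over every axis), a single int,
// or a list/tuple of ints; normalize it to a std::vector<size_t> before
// forwarding to op::sum.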
Tensor py_sum(Tensor input, py::object dim, bool keepdim) {
if (dim.is_none()) {
std::vector<size_t> dim_vec;
for (size_t i = 0; i < input->shape().size(); i++) {
dim_vec.push_back(i);
}
return op::sum(input, dim_vec, keepdim);
} else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
return op::sum(input, dim.cast<std::vector<size_t>>(), keepdim);
} else if (py::isinstance<py::int_>(dim)) {
return op::sum(input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
} else {
throw std::invalid_argument("dim must be a tuple or an integer");
}
}
void py_sum_(Tensor output, Tensor input, py::object dim, bool keepdim) {
if (dim.is_none()) {
std::vector<size_t> dim_vec;
for (size_t i = 0; i < input->shape().size(); i++) {
dim_vec.push_back(i);
}
op::sum_(output, input, dim_vec, keepdim);
} else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
op::sum_(output, input, dim.cast<std::vector<size_t>>(), keepdim);
} else if (py::isinstance<py::int_>(dim)) {
op::sum_(output, input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
} else {
throw std::invalid_argument("dim must be a tuple or an integer");
}
}
inline void bind_sum(py::module &m) {
m.def("sum",
&py_sum,
py::arg("input"),
py::arg("dim"),
py::arg("keepdim"),
R"doc(Sum of input tensor along the given dimensions.)doc");
m.def("sum_",
&py_sum_,
py::arg("output"),
py::arg("input"),
py::arg("dim"),
py::arg("keepdim"),
R"doc(In-place tensor sum.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // needed for automatic std::vector conversions
#include "infinicore/ops/topk.hpp"
namespace py = pybind11;
namespace infinicore::ops {
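// `dim` follows the usual Python convention that -1 selects the last
// dimension; any other negative value is rejected, and non-negative values
// are forwarded unchanged.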
std::pair<Tensor, Tensor> py_topk(Tensor input, size_t k, int dim, bool largest, bool sorted) {
if (dim == -1) {
return op::topk(input, k, input->ndim() - 1, largest, sorted);
} else if (dim >= 0) {
return op::topk(input, k, static_cast<size_t>(dim), largest, sorted);
} else {
throw std::invalid_argument("invalid argument: dim");
}
}
void py_topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, int dim, bool largest, bool sorted) {
if (dim == -1) {
op::topk_(values_output, indices_output, input, k, input->ndim() - 1, largest, sorted);
} else if (dim >= 0) {
op::topk_(values_output, indices_output, input, k, static_cast<size_t>(dim), largest, sorted);
} else {
throw std::invalid_argument("invalid argument: dim");
}
}
inline void bind_topk(py::module &m) {
m.def("topk",
&py_topk,
py::arg("input"),
py::arg("k"),
py::arg("dim"),
py::arg("largest"),
py::arg("sorted"),
R"doc(topk of input tensor along the given dimensions.)doc");
m.def("topk_",
&py_topk_,
py::arg("values_output"),
py::arg("indices_output"),
py::arg("input"),
py::arg("k"),
py::arg("dim"),
py::arg("largest"),
py::arg("sorted"),
R"doc(In-place tensor topk_.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/var.hpp"
namespace py = pybind11;
namespace infinicore::ops {
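// Same `dim` normalization as py_sum. `unbiased` is forwarded to op::var
// unchanged; by analogy with torch.var it presumably selects Bessel's
// correction (divide by N-1 instead of N), but the exact semantics live in the
// op implementation.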
Tensor py_var(Tensor input, py::object dim, bool unbiased, bool keepdim) {
if (dim.is_none()) {
std::vector<size_t> dim_vec;
for (size_t i = 0; i < input->shape().size(); i++) {
dim_vec.push_back(i);
}
return op::var(input, dim_vec, unbiased, keepdim);
} else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
return op::var(input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
} else if (py::isinstance<py::int_>(dim)) {
return op::var(input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
} else {
throw std::invalid_argument("dim must be a tuple or an integer");
}
}
void py_var_(Tensor var_output, Tensor input, py::object dim, bool unbiased, bool keepdim) {
if (dim.is_none()) {
std::vector<size_t> dim_vec;
for (size_t i = 0; i < input->shape().size(); i++) {
dim_vec.push_back(i);
}
op::var_(var_output, input, dim_vec, unbiased, keepdim);
} else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
op::var_(var_output, input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
} else if (py::isinstance<py::int_>(dim)) {
op::var_(var_output, input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
} else {
throw std::invalid_argument("dim must be a list/tuple or an integer");
}
}
inline void bind_var(py::module &m) {
m.def("var",
&py_var,
py::arg("input"),
py::arg("dim"),
py::arg("unbiased"),
py::arg("keepdim"),
R"doc(Var of input tensor along the given dimensions.)doc");
m.def("var_",
&py_var_,
py::arg("var_output"),
py::arg("input"),
py::arg("dim"),
py::arg("unbiased"),
py::arg("keepdim"),
R"doc(In-place tensor Var .)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/var_mean.hpp"
namespace py = pybind11;
namespace infinicore::ops {
std::pair<Tensor, Tensor> py_var_mean(Tensor input, py::object dim, bool unbiased, bool keepdim) {
if (dim.is_none()) {
std::vector<size_t> dim_vec;
for (size_t i = 0; i < input->shape().size(); i++) {
dim_vec.push_back(i);
}
return op::var_mean(input, dim_vec, unbiased, keepdim);
} else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
return op::var_mean(input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
} else if (py::isinstance<py::int_>(dim)) {
return op::var_mean(input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
} else {
throw std::invalid_argument("dim must be a tuple or an integer");
}
}
void py_var_mean_(Tensor var_output, Tensor mean_output, Tensor input, py::object dim, bool unbiased, bool keepdim) {
if (dim.is_none()) {
std::vector<size_t> dim_vec;
for (size_t i = 0; i < input->shape().size(); i++) {
dim_vec.push_back(i);
}
op::var_mean_(var_output, mean_output, input, dim_vec, unbiased, keepdim);
} else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
op::var_mean_(var_output, mean_output, input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
} else if (py::isinstance<py::int_>(dim)) {
op::var_mean_(var_output, mean_output, input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
} else {
throw std::invalid_argument("dim must be a list/tuple or an integer");
}
}
inline void bind_var_mean(py::module &m) {
m.def("var_mean",
&py_var_mean,
py::arg("input"),
py::arg("dim"),
py::arg("unbiased"),
py::arg("keepdim"),
R"doc(Var & Mean of input tensor along the given dimensions.)doc");
m.def("var_mean_",
&py_var_mean_,
py::arg("var_output"),
py::arg("mean_output"),
py::arg("input"),
py::arg("dim"),
py::arg("unbiased"),
py::arg("keepdim"),
R"doc(In-place tensor Var & Mean .)doc");
}
} // namespace infinicore::ops
#ifndef INFINIOP_ALL_DESCRIPTOR_H_
#define INFINIOP_ALL_DESCRIPTOR_H_
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::all::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AllInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
AllInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
size_t *dim, \
size_t dim_size, \
bool keepdim); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
const void *input, \
size_t *dim, \
size_t dim_size, \
bool keepdim, \
void *stream) const; \
}; \
}
#endif
#include "all_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
#include <iostream>
namespace op::all::cpu {
Descriptor::~Descriptor() {}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
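// Two reduction paths: when every dimension is reduced, a single boolean
// accumulates over all input elements; otherwise each output element i
// AND-reduces the reduce_num inputs whose permuted flat index is
// i * reduce_num + j (AllInfo moves the reduced dimensions to the end of the
// permuted shape, so those inputs are consecutive in flat-index space).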
template <typename Tdata>
infiniStatus_t calculateAll(
const AllInfo &info,
bool *output,
const Tdata *input,
size_t *dim,
size_t dim_size,
bool keepdim) {
if (info.reduce_dim_size == info.ndim) {
bool result = true;
for (size_t index = 0; index < info.input_size; index++) {
size_t input_offset = op::common_cpu::indexToOffset(index, info.ndim, info.permuted_input_shape.data(), info.permuted_input_strides.data());
result = result && input[input_offset];
}
output[0] = result;
return INFINI_STATUS_SUCCESS;
} else {
for (size_t i = info.output_size; i-- > 0;) {
size_t output_offset = op::common_cpu::indexToOffset(i, info.output_shape.size(), info.output_shape.data(), info.output_strides.data());
bool result = true;
for (size_t j = 0; j < info.reduce_num; j++) {
size_t input_flat = j + i * info.reduce_num;
size_t input_offset = op::common_cpu::indexToOffset(input_flat, info.ndim, info.permuted_input_shape.data(), info.permuted_input_strides.data());
Tdata input_val = input[input_offset];
bool bool_val = static_cast<bool>(input_val);
result = result && bool_val;
}
output[output_offset] = result;
}
return INFINI_STATUS_SUCCESS;
}
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream) const {
switch (_info.dtype) {
case INFINI_DTYPE_BOOL:
return calculateAll<bool>(_info, reinterpret_cast<bool *>(output), reinterpret_cast<const bool *>(input), dim, dim_size, keepdim);
case INFINI_DTYPE_U8:
return calculateAll<uint8_t>(_info, reinterpret_cast<bool *>(output), reinterpret_cast<const uint8_t *>(input), dim, dim_size, keepdim);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::all::cpu
#ifndef __INFINIOP_ALL_CPU_H__
#define __INFINIOP_ALL_CPU_H__
#include "../all_desc.h"
DESCRIPTOR(cpu);
#endif // __INFINIOP_ALL_CPU_H__
#ifndef __ALL_CUDA_H__
#define __ALL_CUDA_H__
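// Map a flat (contiguous, row-major) element index onto a strided memory
// offset. Example: shape {2, 3}, strides {3, 1}, flat_index 4 ->
// (4 % 3) * 1 + (4 / 3 % 2) * 3 = 1 + 3 = 4.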
__forceinline__ __device__ __host__ size_t
indexToOffset(
size_t flat_index,
size_t ndim,
const size_t *shape,
const ptrdiff_t *strides) {
size_t res = 0;
for (size_t i = ndim; i-- > 0;) {
res += (flat_index % shape[i]) * strides[i];
flat_index /= shape[i];
}
return res;
}
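// Full reduction runs in two stages: allReduceTempKernel AND-reduces each
// block of BLOCK_SIZE elements into one bool per block via a shared-memory
// tree reduction, then finalAllReduceKernel AND-reduces the per-block results
// into the single output value.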
template <size_t BLOCK_SIZE, typename Tdata>
__global__ void allReduceTempKernel(
bool *temp_output,
const Tdata *input,
size_t input_size,
size_t permuted_input_shape_size,
size_t *permuted_input_shape,
ptrdiff_t *permuted_input_strides) {
__shared__ bool s_data[BLOCK_SIZE];
size_t tid = threadIdx.x;
size_t idx = tid + blockIdx.x * blockDim.x;
if (idx < input_size) {
size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
s_data[tid] = static_cast<bool>(input[input_offset]);
} else {
s_data[tid] = true;
}
__syncthreads();
for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
s_data[tid] = s_data[tid] && s_data[tid + s];
}
__syncthreads();
}
if (tid == 0) {
temp_output[blockIdx.x] = s_data[0];
}
}
template <size_t BLOCK_SIZE>
__global__ void finalAllReduceKernel(
bool *output,
const bool *block_results,
size_t num_blocks) {
__shared__ bool s_data[BLOCK_SIZE];
size_t tid = threadIdx.x;
bool thread_val = true;
for (size_t i = tid; i < num_blocks; i += blockDim.x) {
thread_val = thread_val && block_results[i];
}
s_data[tid] = thread_val;
__syncthreads();
for (size_t s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
if (tid < s) {
s_data[tid] = s_data[tid] && s_data[tid + s];
}
__syncthreads();
}
if (tid == 0) {
*output = s_data[0];
}
}
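// Partial reduction: one thread per output element; each thread sequentially
// AND-reduces its reduce_num inputs, so no shared memory or inter-thread
// synchronization is needed here.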
template <size_t BLOCK_SIZE, typename Tdata>
__global__ void allKernel(
bool *output,
const Tdata *input,
size_t permuted_input_shape_size,
size_t output_shape_size,
size_t output_size,
size_t reduce_num,
size_t *permuted_input_shape,
size_t *output_shape,
ptrdiff_t *permuted_input_strides,
ptrdiff_t *output_strides) {
size_t tid = threadIdx.x;
size_t idx = tid + blockIdx.x * blockDim.x;
if (idx >= output_size) {
return;
}
size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides);
bool tempRes = true;
for (size_t i = 0; i < reduce_num; i++) {
size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
tempRes = tempRes && static_cast<bool>(input[input_offset]);
}
output[output_index] = tempRes;
}
#endif // __ALL_CUDA_H__
#ifndef __ALL_INFO_H__
#define __ALL_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <algorithm>
#include <cstddef>
#include <vector>
namespace op::all {
class AllInfo {
AllInfo() = default;
public:
infiniDtype_t dtype;
std::vector<size_t> permuted_input_shape; // need to permute
std::vector<size_t> output_shape;
std::vector<ptrdiff_t> permuted_input_strides; // need to permute
std::vector<ptrdiff_t> output_strides;
size_t reduce_dim_size; // reduce dim size
size_t reduce_num; // number of elements to reduce for each output element
size_t input_size; // total number of input elements
size_t output_size; // total number of output elements
size_t ndim; // number of dimensions
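    // Build the permuted view: non-reduced dimensions first, reduced
    // dimensions last. With that ordering, flat index i * reduce_num + j walks
    // the reduce_num inputs that fold into output element i.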
static utils::Result<AllInfo> create(
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto input_shape = input_desc->shape();
auto input_strides = input_desc->strides();
size_t input_ndim = input_desc->ndim();
size_t reduce_num = 1;
for (size_t i = 0; i < dim_size; i++) {
reduce_num *= input_shape[dim[i]];
}
std::vector<size_t> permute_order;
for (size_t i = 0; i < input_ndim; i++) {
if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
permute_order.push_back(i);
}
}
for (size_t i = 0; i < dim_size; i++) {
permute_order.push_back(dim[i]);
}
std::vector<size_t> permuted_input_shape;
std::vector<ptrdiff_t> permuted_input_strides;
for (size_t i = 0; i < permute_order.size(); i++) {
permuted_input_shape.push_back(input_shape[permute_order[i]]);
permuted_input_strides.push_back(input_strides[permute_order[i]]);
}
return utils::Result<AllInfo>(AllInfo{input_desc->dtype(),
permuted_input_shape,
output_desc->shape(),
permuted_input_strides,
output_desc->strides(),
dim_size,
reduce_num,
input_desc->numel(),
output_desc->numel(),
input_ndim});
}
};
} // namespace op::all
#endif
#ifndef __ALL_METAX_H__
#define __ALL_METAX_H__
#include "../all_desc.h"
DESCRIPTOR(metax);
#endif
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "all_metax.h"
namespace op::all::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
const AllInfo &info,
bool *output, const Tdata *input,
hcStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_hc = permuted_input_shape_hc + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_hc = permuted_input_strides_hc + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
bool *temp_output;
CHECK_METAX(hcMalloc(&temp_output, grid_size * sizeof(bool)));
allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
temp_output, input, input_size, input_ndim, permuted_input_shape_hc, permuted_input_strides_hc);
finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(output, temp_output, grid_size);
CHECK_METAX(hcFree(temp_output));
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_hc, output_shape_hc, permuted_input_strides_hc, output_strides_hc);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream_) const {
hcStream_t stream = (hcStream_t)stream_;
#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \
launchKernel<BLOCK_SIZE, Tdata>( \
_info, \
(bool *)output, (const Tdata *)input, \
stream, workspace, workspace_size)
#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BOOL) \
return CALCULATE_ALL(BLOCK_SIZE, bool); \
else if (_info.dtype == INFINI_DTYPE_U8) \
return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_ALL_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::all::metax
#ifndef __ALL_MOORE_H__
#define __ALL_MOORE_H__
#include "../all_desc.h"
DESCRIPTOR(moore);
#endif
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "all_moore.h"
namespace op::all::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
const AllInfo &info,
bool *output, const Tdata *input,
musaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_musa = permuted_input_shape_musa + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_musa = permuted_input_strides_musa + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(output_shape_musa, info.output_shape.data(), output_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
CHECK_MOORE(musaMemcpyAsync(output_strides_musa, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
bool *temp_output;
CHECK_MOORE(musaMalloc(&temp_output, grid_size * sizeof(bool)));
allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
temp_output, input, input_size, input_ndim, permuted_input_shape_musa, permuted_input_strides_musa);
finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(output, temp_output, grid_size);
CHECK_MOORE(musaFree(temp_output));
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_musa, output_shape_musa, permuted_input_strides_musa, output_strides_musa);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \
launchKernel<BLOCK_SIZE, Tdata>( \
_info, \
(bool *)output, (const Tdata *)input, \
stream, workspace, workspace_size)
#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BOOL) \
return CALCULATE_ALL(BLOCK_SIZE, bool); \
else if (_info.dtype == INFINI_DTYPE_U8) \
return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_ALL_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::all::moore
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "all_nvidia.cuh"
namespace op::all::nvidia {
struct Descriptor::Opaque {
std::shared_ptr<device::nvidia::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
CHECK_RESULT(result);
auto info = result.take();
size_t workspace_size = 0;
workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
info, workspace_size, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
namespace {
template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
const AllInfo &info,
bool *output, const Tdata *input,
cudaStream_t stream, void *workspace, size_t workspace_size) {
size_t input_ndim = info.permuted_input_shape.size();
size_t output_ndim = info.output_shape.size();
size_t input_size = info.input_size;
size_t output_size = info.output_size;
size_t reduce_num = info.reduce_num;
unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
size_t workspace_offset = 0;
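// Workspace layout (sized in Descriptor::create): the permuted input shape and
// the output shape (size_t each), followed by the permuted input strides and
// the output strides (ptrdiff_t each), copied host-to-device asynchronously on
// the caller's stream.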
size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
size_t *output_shape_cuda = permuted_input_shape_cuda + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
ptrdiff_t *output_strides_cuda = permuted_input_strides_cuda + input_ndim;
workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);
CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(output_shape_cuda, info.output_shape.data(), output_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
if (info.reduce_num == input_size) {
size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
bool *temp_output;
CHECK_CUDA(cudaMalloc(&temp_output, grid_size * sizeof(bool)));
allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
temp_output, input, input_size, input_ndim, permuted_input_shape_cuda, permuted_input_strides_cuda);
finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(output, temp_output, grid_size);
CHECK_CUDA(cudaFree(temp_output));
} else {
size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
output, input, input_ndim, output_ndim, output_size, reduce_num,
permuted_input_shape_cuda, output_shape_cuda, permuted_input_strides_cuda, output_strides_cuda);
}
return INFINI_STATUS_SUCCESS;
}
} // namespace
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream_) const {
cudaStream_t stream = (cudaStream_t)stream_;
#define CALCULATE_ALL(BLOCK_SIZE, Tdata) \
launchKernel<BLOCK_SIZE, Tdata>( \
_info, \
(bool *)output, (const Tdata *)input, \
stream, workspace, workspace_size)
#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_BOOL) \
return CALCULATE_ALL(BLOCK_SIZE, bool); \
else if (_info.dtype == INFINI_DTYPE_U8) \
return CALCULATE_ALL(BLOCK_SIZE, uint8_t); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() >= 256) {
CALCULATE_ALL_WITH_BLOCK_SIZE(256)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::all::nvidia
#ifndef __ALL_NVIDIA_H__
#define __ALL_NVIDIA_H__
#include "../all_desc.h"
DESCRIPTOR(nvidia);
#endif // __ALL_NVIDIA_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/all.h"
#include <vector>
#ifdef ENABLE_CPU_API
#include "cpu/all_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/all_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/all_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/all_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/all_moore.h"
#endif
__INFINI_C infiniStatus_t infiniopCreateAllDescriptor(
infiniopHandle_t handle,
infiniopAllDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
size_t *dim,
size_t dim_size,
bool keepdim) {
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::all::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::all::NAMESPACE::Descriptor **>(desc_ptr), \
output_desc, \
input_desc, \
dim, \
dim_size, \
keepdim)
switch (handle->device) {
#ifdef ENABLE_CPU_API
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CREATE
}
__INFINI_C infiniStatus_t infiniopGetAllWorkspaceSize(infiniopAllDescriptor_t desc, size_t *size) {
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::all::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef GET
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
__INFINI_C infiniStatus_t infiniopAll(
infiniopAllDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *input,
size_t *dim,
size_t dim_size,
bool keepdim,
void *stream) {
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::all::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, output, input, dim, dim_size, keepdim, stream)
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef CALCULATE
}
__INFINI_C infiniStatus_t
infiniopDestroyAllDescriptor(infiniopAllDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::all::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch (desc->device_type) {
#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
DELETE(INFINI_DEVICE_MOORE, moore);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef DELETE
}
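// A minimal usage sketch of the C API above, assuming a valid handle,
// pre-built output/input tensor descriptors, device buffers, and a stream
// already exist (their creation is outside this file); CHECK stands in for the
// caller's status handling:
//
//   infiniopAllDescriptor_t desc = nullptr;
//   size_t dims[] = {1}; // reduce over dimension 1
//   CHECK(infiniopCreateAllDescriptor(handle, &desc, out_desc, in_desc,
//                                     dims, 1, /*keepdim=*/false));
//   size_t ws_size = 0;
//   CHECK(infiniopGetAllWorkspaceSize(desc, &ws_size));
//   // allocate `workspace` of ws_size bytes on the device, then:
//   CHECK(infiniopAll(desc, workspace, ws_size, out_ptr, in_ptr,
//                     dims, 1, /*keepdim=*/false, stream));
//   CHECK(infiniopDestroyAllDescriptor(desc));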
#ifndef __AVG_POOL1D_H__
#define __AVG_POOL1D_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "infiniop/ops/avg_pool1d.h"
#define DESCRIPTOR(NAMESPACE) \
namespace op::avg_pool1d::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AvgPool1dInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
AvgPool1dInfo info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
size_t kernel_size, \
size_t stride, \
size_t padding); \
\
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
class AvgPool1dInfo {
private:
AvgPool1dInfo() = default;
public:
infiniDtype_t dtype;
size_t batch, channels, in_width, out_width;
size_t kernel_size, stride, padding;
ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width;
ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width;
static utils::Result<AvgPool1dInfo> createAvgPool1dInfo(
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t kernel_size,
size_t stride,
size_t padding) {
CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER);
const infiniDtype_t dtype = y_desc->dtype();
CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t batch = x_desc->dim(0);
size_t channels = x_desc->dim(1);
size_t in_width = x_desc->dim(2);
CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(y_desc->dim(1) == channels, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t padded_len = in_width + 2 * padding;
CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t expected_out_width = (padded_len - kernel_size) / stride + 1;
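// e.g. in_width = 7, padding = 1, kernel_size = 3, stride = 2:
// padded_len = 9, out_width = (9 - 3) / 2 + 1 = 4.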
CHECK_OR_RETURN(y_desc->dim(2) == expected_out_width, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t out_width = expected_out_width;
return utils::Result<AvgPool1dInfo>(AvgPool1dInfo{
dtype,
batch, channels, in_width, out_width,
kernel_size, stride, padding,
y_desc->stride(0), y_desc->stride(1), y_desc->stride(2),
x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)});
}
};
#endif