"vscode:/vscode.git/clone" did not exist on "0fd839682765307c49f9aa67b5c168e3d30bc356"
Commit 210e31d3 authored by PanZezhong's avatar PanZezhong
Browse files

issue/1031 T1-1-4

parent 7f295448
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/sum.hpp"
#include <infiniop.h>
namespace infinicore::op::sum_impl::infiniop {

// Per-thread, per-device LRU cache of infiniop Sum descriptors, keyed by a
// hash of the call configuration. Evicted entries destroy their descriptor.
thread_local common::OpCache<size_t, infiniopSumDescriptor_t> caches(
    100, // capacity
    [](infiniopSumDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroySumDescriptor(desc));
            desc = nullptr;
        }
    });

// Reduces `input` over the axes in `dim` into `output` via the infiniop Sum
// operator, creating (and caching) the descriptor on first use.
void calculate(Tensor output, Tensor input, std::vector<size_t> dim, bool keepdim) {
    // BUGFIX: the cache key must depend on the *contents* of `dim`, not just
    // its length — otherwise calls reducing different axes (e.g. {0} vs {1})
    // collide and silently reuse the wrong cached descriptor.
    size_t seed = hash_combine(output, input, dim.size(), keepdim);
    for (const auto &d : dim) {
        seed = hash_combine(seed, d);
    }
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);
    auto desc_opt = cache.get(seed);
    infiniopSumDescriptor_t desc = nullptr;
    if (!desc_opt) {
        INFINICORE_CHECK_ERROR(infiniopCreateSumDescriptor(
            context::getInfiniopHandle(output->device()), &desc,
            output->desc(), input->desc(), dim.data(), dim.size(), keepdim));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetSumWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopSum(
        desc, workspace->data(), workspace_size,
        output->data(), input->data(), dim.data(), dim.size(), keepdim, context::getStream()));
}

// Self-registration of this implementation for all supported device types.
static bool registered = []() {
    Sum::dispatcher().registerDevice({Device::Type::CPU,
                                      Device::Type::NVIDIA,
                                      Device::Type::METAX,
                                      Device::Type::MOORE,
                                      Device::Type::ILUVATAR},
                                     &calculate, false);
    return true;
}();

} // namespace infinicore::op::sum_impl::infiniop
#include "infinicore/ops/topk.hpp"
#include "../../utils.hpp"
#include <stdexcept>
#include <vector>
namespace infinicore::op {

common::OpDispatcher<TopK::schema> &TopK::dispatcher() {
    static common::OpDispatcher<TopK::schema> dispatcher_;
    return dispatcher_;
}

// Dispatches top-k to the implementation registered for the input's device.
void TopK::execute(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest, bool sorted) {
    // BUGFIX: indices_output must also live on the same device; the other
    // multi-output op (Var_Mean) already checks every tensor.
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(values_output, indices_output, input);
    infinicore::context::setDevice(input->device());
    auto device_type = context::getDevice().getType();
    auto func = dispatcher().lookup(device_type);
    if (func == nullptr) {
        throw std::runtime_error("No TopK implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
    }
    func(values_output, indices_output, input, k, dim, largest, sorted);
}

// Returns the top-k (values, indices) of `input` along `dim`.
// NOTE(review): indices are allocated as I32 — confirm this matches the
// kernels' index type (PyTorch convention is int64).
std::pair<Tensor, Tensor> topk(Tensor input, size_t k, size_t dim, bool largest, bool sorted) {
    auto in_shape = input->shape();
    // Validate up front instead of crashing inside a backend kernel.
    if (dim >= in_shape.size()) {
        throw std::invalid_argument("topk: dim out of range");
    }
    if (k > in_shape[dim]) {
        throw std::invalid_argument("topk: k exceeds the size of the selected dimension");
    }
    std::vector<size_t> out_shape = in_shape;
    out_shape[dim] = k;
    auto values_output = Tensor::empty(out_shape, input->dtype(), input->device());
    auto indices_output = Tensor::empty(out_shape, DataType::I32, input->device());
    topk_(values_output, indices_output, input, k, dim, largest, sorted);
    return {values_output, indices_output};
}

// In-place variant: fills caller-provided value/index tensors.
void topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest, bool sorted) {
    TopK::execute(values_output, indices_output, input, k, dim, largest, sorted);
}

} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/topk.hpp"
#include <infiniop.h>
namespace infinicore::op::topk_impl::infiniop {

// Per-thread, per-device cache of infiniop TopK descriptors; the eviction
// callback tears down the descriptor it owns.
thread_local common::OpCache<size_t, infiniopTopKDescriptor_t> caches(
    100, // capacity
    [](infiniopTopKDescriptor_t &descriptor) {
        if (descriptor != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyTopKDescriptor(descriptor));
            descriptor = nullptr;
        }
    });

// Runs the infiniop TopK operator, reusing a cached descriptor when the same
// call configuration has been seen before on this thread/device.
void calculate(Tensor values_output, Tensor indices_output, Tensor input, size_t k, size_t dim, bool largest, bool sorted) {
    const size_t key = hash_combine(values_output, indices_output, input, k, dim, largest, sorted);
    auto &cache = caches.getCache(context::getDevice().getType(), context::getDevice().getIndex());

    infiniopTopKDescriptor_t descriptor = nullptr;
    if (auto cached = cache.get(key)) {
        descriptor = *cached;
    } else {
        INFINICORE_CHECK_ERROR(infiniopCreateTopKDescriptor(
            context::getInfiniopHandle(values_output->device()), &descriptor,
            values_output->desc(), indices_output->desc(), input->desc(), k, dim, largest, sorted));
        cache.put(key, descriptor);
    }

    size_t workspace_bytes = 0;
    INFINICORE_CHECK_ERROR(infiniopGetTopKWorkspaceSize(descriptor, &workspace_bytes));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_bytes);

    INFINICORE_CHECK_ERROR(infiniopTopK(
        descriptor, workspace->data(), workspace_bytes,
        values_output->data(), indices_output->data(), input->data(), k, dim, largest, sorted, context::getStream()));
}

// Register this implementation for every backend infiniop supports.
static bool registered = []() {
    TopK::dispatcher().registerDevice({Device::Type::CPU,
                                       Device::Type::NVIDIA,
                                       Device::Type::METAX,
                                       Device::Type::MOORE,
                                       Device::Type::ILUVATAR},
                                      &calculate, false);
    return true;
}();

} // namespace infinicore::op::topk_impl::infiniop
#include "infinicore/ops/var.hpp"
#include "../../utils.hpp"
#include <algorithm> // std::sort / std::unique on the reduction-axis list
#include <stdexcept>
#include <vector>
namespace infinicore::op {

common::OpDispatcher<Var::schema> &Var::dispatcher() {
    static common::OpDispatcher<Var::schema> dispatcher_;
    return dispatcher_;
}

// Dispatches the variance computation to the implementation registered for
// the current device type.
void Var::execute(Tensor var_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(var_output, input);
    infinicore::context::setDevice(input->device());
    auto device_type = context::getDevice().getType();
    auto func = dispatcher().lookup(device_type);
    if (func == nullptr) {
        throw std::runtime_error("No Var implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
    }
    func(var_output, input, dim, unbiased, keepdim);
}

// Computes the variance of `input` over `dim` (all dimensions when `dim` is
// empty), allocating the output tensor with the reduced shape.
Tensor var(Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
    auto in_shape = input->shape();
    // Empty `dim` means "reduce over every dimension".
    if (dim.empty()) {
        for (size_t i = 0; i < in_shape.size(); i++) {
            dim.push_back(i);
        }
    }
    // Sort and deduplicate the reduction axes. BUGFIX: duplicate entries
    // previously corrupted the shape computation (e.g. dim={1,1} on a 2-D
    // input satisfied dim.size() == ndim and was treated as a full reduction).
    std::sort(dim.begin(), dim.end());
    dim.erase(std::unique(dim.begin(), dim.end()), dim.end());
    std::vector<size_t> out_shape;
    if (dim.size() == in_shape.size() && !keepdim) {
        out_shape = {}; // full reduction -> scalar
    } else if (keepdim) {
        // Reduced axes are kept with extent 1.
        size_t j = 0;
        for (size_t i = 0; i < in_shape.size(); i++) {
            if (j < dim.size() && dim[j] == i) {
                out_shape.push_back(1);
                j++;
            } else {
                out_shape.push_back(in_shape[i]);
            }
        }
    } else {
        // Reduced axes are dropped.
        size_t j = 0;
        for (size_t i = 0; i < in_shape.size(); i++) {
            if (j < dim.size() && dim[j] == i) {
                j++;
            } else {
                out_shape.push_back(in_shape[i]);
            }
        }
    }
    auto var_output = Tensor::empty(out_shape, input->dtype(), input->device());
    var_(var_output, input, dim, unbiased, keepdim);
    return var_output;
}

// In-place variant: writes the variance into the caller-provided tensor.
void var_(Tensor var_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
    Var::execute(var_output, input, dim, unbiased, keepdim);
}

} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/var.hpp"
#include <infiniop.h>
namespace infinicore::op::var_impl::infiniop {

// Per-thread, per-device LRU cache of infiniop Var descriptors.
thread_local common::OpCache<size_t, infiniopVarDescriptor_t> caches(
    100, // capacity
    [](infiniopVarDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyVarDescriptor(desc));
            desc = nullptr;
        }
    });

// Computes the variance of `input` over `dim` into `var_output` via the
// infiniop Var operator, creating (and caching) the descriptor on first use.
void calculate(Tensor var_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
    // BUGFIX: the cache key must depend on the *contents* of `dim`, not just
    // its length — otherwise calls reducing different axes collide and
    // silently reuse the wrong cached descriptor.
    size_t seed = hash_combine(var_output, input, dim.size(), unbiased, keepdim);
    for (const auto &d : dim) {
        seed = hash_combine(seed, d);
    }
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);
    auto desc_opt = cache.get(seed);
    infiniopVarDescriptor_t desc = nullptr;
    if (!desc_opt) {
        INFINICORE_CHECK_ERROR(infiniopCreateVarDescriptor(
            context::getInfiniopHandle(var_output->device()), &desc,
            var_output->desc(), input->desc(), dim.data(), dim.size(), unbiased, keepdim));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetVarWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopVar(
        desc, workspace->data(), workspace_size,
        var_output->data(), input->data(), dim.data(), dim.size(), unbiased, keepdim, context::getStream()));
}

// Self-registration of this implementation for all supported device types.
static bool registered = []() {
    Var::dispatcher().registerDevice({Device::Type::CPU,
                                      Device::Type::NVIDIA,
                                      Device::Type::METAX,
                                      Device::Type::MOORE,
                                      Device::Type::ILUVATAR},
                                     &calculate, false);
    return true;
}();

} // namespace infinicore::op::var_impl::infiniop
#include "infinicore/ops/var_mean.hpp"
#include "../../utils.hpp"
#include <stdexcept>
#include <vector>
namespace infinicore::op {

common::OpDispatcher<Var_Mean::schema> &Var_Mean::dispatcher() {
    static common::OpDispatcher<Var_Mean::schema> dispatcher_;
    return dispatcher_;
}

// Dispatches the fused variance+mean computation to the implementation
// registered for the current device type.
void Var_Mean::execute(Tensor var_output, Tensor mean_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(var_output, mean_output, input);
    infinicore::context::setDevice(input->device());
    auto device_type = context::getDevice().getType();
    auto func = dispatcher().lookup(device_type);
    if (func == nullptr) {
        throw std::runtime_error("No Var_Mean implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
    }
    func(var_output, mean_output, input, dim, unbiased, keepdim);
}

// Computes (variance, mean) of `input` over `dim` (all dimensions when `dim`
// is empty), allocating both outputs with the reduced shape.
std::pair<Tensor, Tensor> var_mean(Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
    auto in_shape = input->shape();
    // Empty `dim` means "reduce over every dimension".
    if (dim.empty()) {
        for (size_t i = 0; i < in_shape.size(); i++) {
            dim.push_back(i);
        }
    }
    // Sort and deduplicate the reduction axes. BUGFIX: duplicate entries
    // previously corrupted the shape computation (e.g. dim={1,1} on a 2-D
    // input satisfied dim.size() == ndim and was treated as a full reduction).
    std::sort(dim.begin(), dim.end());
    dim.erase(std::unique(dim.begin(), dim.end()), dim.end());
    std::vector<size_t> out_shape;
    if (dim.size() == in_shape.size() && !keepdim) {
        out_shape = {}; // full reduction -> scalar
    } else if (keepdim) {
        // Reduced axes are kept with extent 1.
        size_t j = 0;
        for (size_t i = 0; i < in_shape.size(); i++) {
            if (j < dim.size() && dim[j] == i) {
                out_shape.push_back(1);
                j++;
            } else {
                out_shape.push_back(in_shape[i]);
            }
        }
    } else {
        // Reduced axes are dropped.
        size_t j = 0;
        for (size_t i = 0; i < in_shape.size(); i++) {
            if (j < dim.size() && dim[j] == i) {
                j++;
            } else {
                out_shape.push_back(in_shape[i]);
            }
        }
    }
    auto var_output = Tensor::empty(out_shape, input->dtype(), input->device());
    auto mean_output = Tensor::empty(out_shape, input->dtype(), input->device());
    var_mean_(var_output, mean_output, input, dim, unbiased, keepdim);
    return {var_output, mean_output};
}

// In-place variant: writes variance and mean into caller-provided tensors.
void var_mean_(Tensor var_output, Tensor mean_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
    Var_Mean::execute(var_output, mean_output, input, dim, unbiased, keepdim);
}

} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/var_mean.hpp"
#include <infiniop.h>
// TODO: the calculate() implementation below still needs to be revised.
namespace infinicore::op::var_mean_impl::infiniop {

// Per-thread, per-device LRU cache of infiniop VarMean descriptors.
thread_local common::OpCache<size_t, infiniopVarMeanDescriptor_t> caches(
    100, // capacity
    [](infiniopVarMeanDescriptor_t &desc) {
        if (desc != nullptr) {
            INFINICORE_CHECK_ERROR(infiniopDestroyVarMeanDescriptor(desc));
            desc = nullptr;
        }
    });

// Computes variance and mean of `input` over `dim` via the infiniop VarMean
// operator, creating (and caching) the descriptor on first use.
void calculate(Tensor var_output, Tensor mean_output, Tensor input, std::vector<size_t> dim, bool unbiased, bool keepdim) {
    // BUGFIX: the cache key must depend on the *contents* of `dim`, not just
    // its length — otherwise calls reducing different axes collide and
    // silently reuse the wrong cached descriptor.
    size_t seed = hash_combine(var_output, mean_output, input, dim.size(), unbiased, keepdim);
    for (const auto &d : dim) {
        seed = hash_combine(seed, d);
    }
    auto device_type = context::getDevice().getType();
    auto device_index = context::getDevice().getIndex();
    auto &cache = caches.getCache(device_type, device_index);
    auto desc_opt = cache.get(seed);
    infiniopVarMeanDescriptor_t desc = nullptr;
    if (!desc_opt) {
        INFINICORE_CHECK_ERROR(infiniopCreateVarMeanDescriptor(
            context::getInfiniopHandle(var_output->device()), &desc,
            var_output->desc(), mean_output->desc(), input->desc(), dim.data(), dim.size(), unbiased, keepdim));
        cache.put(seed, desc);
    } else {
        desc = *desc_opt;
    }
    size_t workspace_size = 0;
    INFINICORE_CHECK_ERROR(infiniopGetVarMeanWorkspaceSize(desc, &workspace_size));
    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
    INFINICORE_CHECK_ERROR(infiniopVarMean(
        desc, workspace->data(), workspace_size,
        var_output->data(), mean_output->data(), input->data(), dim.data(), dim.size(), unbiased, keepdim, context::getStream()));
}

// Self-registration of this implementation for all supported device types.
static bool registered = []() {
    Var_Mean::dispatcher().registerDevice({Device::Type::CPU,
                                           Device::Type::NVIDIA,
                                           Device::Type::METAX,
                                           Device::Type::MOORE,
                                           Device::Type::ILUVATAR},
                                          &calculate, false);
    return true;
}();

} // namespace infinicore::op::var_mean_impl::infiniop
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "ops/adaptive_max_pool1d.hpp" #include "ops/adaptive_max_pool1d.hpp"
#include "ops/add.hpp" #include "ops/add.hpp"
#include "ops/add_rms_norm.hpp" #include "ops/add_rms_norm.hpp"
#include "ops/all.hpp"
#include "ops/asinh.hpp" #include "ops/asinh.hpp"
#include "ops/attention.hpp" #include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp" #include "ops/avg_pool1d.hpp"
...@@ -33,7 +34,11 @@ ...@@ -33,7 +34,11 @@
#include "ops/rope.hpp" #include "ops/rope.hpp"
#include "ops/silu.hpp" #include "ops/silu.hpp"
#include "ops/silu_and_mul.hpp" #include "ops/silu_and_mul.hpp"
#include "ops/sum.hpp"
#include "ops/swiglu.hpp" #include "ops/swiglu.hpp"
#include "ops/topk.hpp"
#include "ops/var.hpp"
#include "ops/var_mean.hpp"
namespace py = pybind11; namespace py = pybind11;
...@@ -73,6 +78,11 @@ inline void bind(py::module &m) { ...@@ -73,6 +78,11 @@ inline void bind(py::module &m) {
bind_linear_w8a8i8(m); bind_linear_w8a8i8(m);
bind_silu_and_mul(m); bind_silu_and_mul(m);
bind_equal(m); bind_equal(m);
bind_sum(m);
bind_var_mean(m);
bind_var(m);
bind_topk(m);
bind_all(m);
} }
} // namespace infinicore::ops } // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/all.hpp"
namespace py = pybind11;
namespace infinicore::ops {

// Python-facing wrapper for op::all. `dim` may be None (reduce over every
// dimension), an int, or a list/tuple of ints.
// BUGFIX: marked `inline` — these definitions live in a header, so non-inline
// functions violate the ODR once the header is included from more than one
// translation unit.
// NOTE(review): dim.cast<std::vector<size_t>>() requires <pybind11/stl.h>,
// which this header does not include itself — confirm it is pulled in by the
// aggregating binding TU.
inline Tensor py_all(Tensor input, py::object dim, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        // size_t index avoids the signed/unsigned comparison of the old `int i`.
        for (size_t i = 0; i < input->shape().size(); ++i) {
            dim_vec.push_back(i);
        }
        return op::all(input, dim_vec, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return op::all(input, dim.cast<std::vector<size_t>>(), keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        return op::all(input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
    } else {
        // BUGFIX: the old message omitted the accepted list/None forms.
        throw std::invalid_argument("dim must be None, a list/tuple of ints, or an integer");
    }
}

// In-place variant: writes the result into a caller-provided tensor.
inline void py_all_(Tensor output, Tensor input, py::object dim, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (size_t i = 0; i < input->shape().size(); ++i) {
            dim_vec.push_back(i);
        }
        op::all_(output, input, dim_vec, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        op::all_(output, input, dim.cast<std::vector<size_t>>(), keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        op::all_(output, input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
    } else {
        throw std::invalid_argument("dim must be None, a list/tuple of ints, or an integer");
    }
}

// Registers `all` / `all_` on the Python module.
inline void bind_all(py::module &m) {
    m.def("all",
          &py_all,
          py::arg("input"),
          py::arg("dim"),
          py::arg("keepdim"),
          R"doc(All of input tensor along the given dimensions.)doc");
    m.def("all_",
          &py_all_,
          py::arg("output"),
          py::arg("input"),
          py::arg("dim"),
          py::arg("keepdim"),
          R"doc(In-place tensor all.)doc");
}

} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/sum.hpp"
namespace py = pybind11;
namespace infinicore::ops {

// Python-facing wrapper for op::sum. `dim` may be None (reduce over every
// dimension), an int, or a list/tuple of ints.
// BUGFIX: marked `inline` — these definitions live in a header, so non-inline
// functions violate the ODR once the header is included from more than one
// translation unit.
// NOTE(review): dim.cast<std::vector<size_t>>() requires <pybind11/stl.h>,
// which this header does not include itself — confirm it is pulled in by the
// aggregating binding TU.
inline Tensor py_sum(Tensor input, py::object dim, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        // size_t index avoids the signed/unsigned comparison of the old `int i`.
        for (size_t i = 0; i < input->shape().size(); ++i) {
            dim_vec.push_back(i);
        }
        return op::sum(input, dim_vec, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return op::sum(input, dim.cast<std::vector<size_t>>(), keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        return op::sum(input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
    } else {
        // BUGFIX: the old message omitted the accepted list/None forms.
        throw std::invalid_argument("dim must be None, a list/tuple of ints, or an integer");
    }
}

// In-place variant: writes the result into a caller-provided tensor.
inline void py_sum_(Tensor output, Tensor input, py::object dim, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (size_t i = 0; i < input->shape().size(); ++i) {
            dim_vec.push_back(i);
        }
        op::sum_(output, input, dim_vec, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        op::sum_(output, input, dim.cast<std::vector<size_t>>(), keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        op::sum_(output, input, std::vector<size_t>(1, dim.cast<size_t>()), keepdim);
    } else {
        throw std::invalid_argument("dim must be None, a list/tuple of ints, or an integer");
    }
}

// Registers `sum` / `sum_` on the Python module.
inline void bind_sum(py::module &m) {
    m.def("sum",
          &py_sum,
          py::arg("input"),
          py::arg("dim"),
          py::arg("keepdim"),
          R"doc(Sum of input tensor along the given dimensions.)doc");
    m.def("sum_",
          &py_sum_,
          py::arg("output"),
          py::arg("input"),
          py::arg("dim"),
          py::arg("keepdim"),
          R"doc(In-place tensor sum.)doc");
}

} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl.h> // required for automatic std::pair / std::vector conversions
#include "infinicore/ops/topk.hpp"
namespace py = pybind11;
namespace infinicore::ops {
std::pair<Tensor, Tensor> py_topk(Tensor input, size_t k, int dim, bool largest, bool sorted) {
if (dim == -1) {
return op::topk(input, k, input->ndim() - 1, largest, sorted);
} else if (dim >= 0) {
return op::topk(input, k, static_cast<size_t>(dim), largest, sorted);
} else {
throw std::invalid_argument("invalid argument: dim");
}
}
void py_topk_(Tensor values_output, Tensor indices_output, Tensor input, size_t k, int dim, bool largest, bool sorted) {
if (dim == -1) {
op::topk_(values_output, indices_output, input, k, input->ndim() - 1, largest, sorted);
} else if (dim >= 0) {
op::topk_(values_output, indices_output, input, k, static_cast<size_t>(dim), largest, sorted);
} else {
throw std::invalid_argument("invalid argument: dim");
}
}
inline void bind_topk(py::module &m) {
m.def("topk",
&py_topk,
py::arg("input"),
py::arg("k"),
py::arg("dim"),
py::arg("largest"),
py::arg("sorted"),
R"doc(topk of input tensor along the given dimensions.)doc");
m.def("topk_",
&py_topk_,
py::arg("values_output"),
py::arg("indices_output"),
py::arg("input"),
py::arg("k"),
py::arg("dim"),
py::arg("largest"),
py::arg("sorted"),
R"doc(In-place tensor topk_.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/var.hpp"
namespace py = pybind11;
namespace infinicore::ops {

// Python-facing wrapper for op::var. `dim` may be None (reduce over every
// dimension), an int, or a list/tuple of ints.
// BUGFIX: marked `inline` — these definitions live in a header, so non-inline
// functions violate the ODR once the header is included from more than one
// translation unit. Error messages are also made consistent (the two
// functions previously disagreed about whether lists were accepted).
inline Tensor py_var(Tensor input, py::object dim, bool unbiased, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        // size_t index avoids the signed/unsigned comparison of the old `int i`.
        for (size_t i = 0; i < input->shape().size(); ++i) {
            dim_vec.push_back(i);
        }
        return op::var(input, dim_vec, unbiased, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return op::var(input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        return op::var(input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
    } else {
        throw std::invalid_argument("dim must be None, a list/tuple of ints, or an integer");
    }
}

// In-place variant: writes the variance into a caller-provided tensor.
inline void py_var_(Tensor var_output, Tensor input, py::object dim, bool unbiased, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (size_t i = 0; i < input->shape().size(); ++i) {
            dim_vec.push_back(i);
        }
        op::var_(var_output, input, dim_vec, unbiased, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        op::var_(var_output, input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        op::var_(var_output, input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
    } else {
        throw std::invalid_argument("dim must be None, a list/tuple of ints, or an integer");
    }
}

// Registers `var` / `var_` on the Python module.
inline void bind_var(py::module &m) {
    m.def("var",
          &py_var,
          py::arg("input"),
          py::arg("dim"),
          py::arg("unbiased"),
          py::arg("keepdim"),
          R"doc(Var of input tensor along the given dimensions.)doc");
    m.def("var_",
          &py_var_,
          py::arg("var_output"),
          py::arg("input"),
          py::arg("dim"),
          py::arg("unbiased"),
          py::arg("keepdim"),
          R"doc(In-place tensor Var .)doc");
}

} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/var_mean.hpp"
namespace py = pybind11;
namespace infinicore::ops {

// Python-facing wrapper for op::var_mean, returning (variance, mean).
// `dim` may be None (reduce over every dimension), an int, or a list/tuple.
// BUGFIX: marked `inline` — these definitions live in a header, so non-inline
// functions violate the ODR once the header is included from more than one
// translation unit. Error messages are also made consistent between the two
// overloads.
inline std::pair<Tensor, Tensor> py_var_mean(Tensor input, py::object dim, bool unbiased, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        // size_t index avoids the signed/unsigned comparison of the old `int i`.
        for (size_t i = 0; i < input->shape().size(); ++i) {
            dim_vec.push_back(i);
        }
        return op::var_mean(input, dim_vec, unbiased, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        return op::var_mean(input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        return op::var_mean(input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
    } else {
        throw std::invalid_argument("dim must be None, a list/tuple of ints, or an integer");
    }
}

// In-place variant: writes variance and mean into caller-provided tensors.
inline void py_var_mean_(Tensor var_output, Tensor mean_output, Tensor input, py::object dim, bool unbiased, bool keepdim) {
    if (dim.is_none()) {
        std::vector<size_t> dim_vec;
        for (size_t i = 0; i < input->shape().size(); ++i) {
            dim_vec.push_back(i);
        }
        op::var_mean_(var_output, mean_output, input, dim_vec, unbiased, keepdim);
    } else if (py::isinstance<py::tuple>(dim) || py::isinstance<py::list>(dim)) {
        op::var_mean_(var_output, mean_output, input, dim.cast<std::vector<size_t>>(), unbiased, keepdim);
    } else if (py::isinstance<py::int_>(dim)) {
        op::var_mean_(var_output, mean_output, input, std::vector<size_t>(1, dim.cast<size_t>()), unbiased, keepdim);
    } else {
        throw std::invalid_argument("dim must be None, a list/tuple of ints, or an integer");
    }
}

// Registers `var_mean` / `var_mean_` on the Python module.
inline void bind_var_mean(py::module &m) {
    m.def("var_mean",
          &py_var_mean,
          py::arg("input"),
          py::arg("dim"),
          py::arg("unbiased"),
          py::arg("keepdim"),
          R"doc(Var & Mean of input tensor along the given dimensions.)doc");
    m.def("var_mean_",
          &py_var_mean_,
          py::arg("var_output"),
          py::arg("mean_output"),
          py::arg("input"),
          py::arg("dim"),
          py::arg("unbiased"),
          py::arg("keepdim"),
          R"doc(In-place tensor Var & Mean .)doc");
}

} // namespace infinicore::ops
#ifndef INFINIOP_ALL_DESCRIPTOR_H_
#define INFINIOP_ALL_DESCRIPTOR_H_
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"
// Declares the per-backend Descriptor class for the "all" (logical-AND)
// reduction operator. Each backend (cpu, metax, ...) expands this macro
// inside its own header; the definitions of ~Descriptor, create() and
// calculate() live in that backend's source file.
// NOTE: comments cannot be placed inside the macro body itself — a `//`
// comment would swallow the line-continuation backslash.
#define DESCRIPTOR(NAMESPACE)                                         \
                                                                      \
    namespace op::all::NAMESPACE {                                    \
    class Descriptor final : public InfiniopDescriptor {              \
        struct Opaque;                                                \
        Opaque *_opaque;                                              \
        AllInfo _info;                                                \
        size_t _workspace_size;                                       \
                                                                      \
        Descriptor(                                                   \
            Opaque *opaque,                                           \
            AllInfo info,                                             \
            size_t workspace_size,                                    \
            infiniDevice_t device_type,                               \
            int device_id)                                            \
            : InfiniopDescriptor{device_type, device_id},             \
              _opaque(opaque),                                        \
              _info(info),                                            \
              _workspace_size(workspace_size) {}                      \
                                                                      \
    public:                                                           \
        ~Descriptor();                                                \
        size_t workspaceSize() const { return _workspace_size; }      \
                                                                      \
        static infiniStatus_t create(                                 \
            infiniopHandle_t handle,                                  \
            Descriptor **desc_ptr,                                    \
            infiniopTensorDescriptor_t output_desc,                   \
            infiniopTensorDescriptor_t input_desc,                    \
            size_t *dim,                                              \
            size_t dim_size,                                          \
            bool keepdim);                                            \
                                                                      \
        infiniStatus_t calculate(                                     \
            void *workspace, size_t workspace_size,                   \
            void *output,                                             \
            const void *input,                                        \
            size_t *dim,                                              \
            size_t dim_size,                                          \
            bool keepdim,                                             \
            void *stream) const;                                      \
    };                                                                \
    }
#endif
#include "all_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
#include <iostream>
namespace op::all::cpu {

Descriptor::~Descriptor() {}

// Builds a CPU descriptor for the "all" reduction. The CPU path needs no
// opaque backend state and no workspace (workspace_size = 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
    CHECK_RESULT(result);
    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

namespace {

// Host logical-AND reduction over the (already permuted) input.
// `dim`, `dim_size` and `keepdim` are unused here: the permutation and shape
// bookkeeping were folded into `info` by AllInfo::create.
template <typename Tdata>
infiniStatus_t calculateAll(
    const AllInfo &info,
    bool *output,
    const Tdata *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    if (info.reduce_dim_size == info.ndim) {
        // Full reduction: every element folds into a single boolean.
        bool result = true;
        for (size_t index = 0; index < info.input_size; index++) {
            size_t input_offset = op::common_cpu::indexToOffset(index, info.ndim, info.permuted_input_shape.data(), info.permuted_input_strides.data());
            if (!static_cast<bool>(input[input_offset])) {
                result = false;
                break; // PERF: AND can never recover once a false is seen
            }
        }
        output[0] = result;
    } else {
        // Partial reduction: each output element ANDs `reduce_num`
        // logically-consecutive input elements (the reduced axes were
        // permuted to the back, so they are contiguous in the flat index).
        for (size_t i = info.output_size; i-- > 0;) {
            size_t output_offset = op::common_cpu::indexToOffset(i, info.output_shape.size(), info.output_shape.data(), info.output_strides.data());
            bool result = true;
            for (size_t j = 0; j < info.reduce_num; j++) {
                size_t input_flat = j + i * info.reduce_num;
                size_t input_offset = op::common_cpu::indexToOffset(input_flat, info.ndim, info.permuted_input_shape.data(), info.permuted_input_strides.data());
                if (!static_cast<bool>(input[input_offset])) {
                    result = false;
                    break; // PERF: short-circuit the inner AND
                }
            }
            output[output_offset] = result;
        }
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace

// Dispatches on the input dtype and runs the reduction on the host.
// `workspace`/`stream` are unused on the CPU backend.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_BOOL:
        return calculateAll<bool>(_info, reinterpret_cast<bool *>(output), reinterpret_cast<const bool *>(input), dim, dim_size, keepdim);
    case INFINI_DTYPE_U8:
        return calculateAll<uint8_t>(_info, reinterpret_cast<bool *>(output), reinterpret_cast<const uint8_t *>(input), dim, dim_size, keepdim);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}

} // namespace op::all::cpu
#ifndef __INFINIOP_ALL_CPU_H__
#define __INFINIOP_ALL_CPU_H__
#include "../all_desc.h"
// Instantiates op::all::cpu::Descriptor from the shared DESCRIPTOR macro.
DESCRIPTOR(cpu);
#endif // __INFINIOP_ALL_CPU_H__
#ifndef __ALL_CUDA_H__
#define __ALL_CUDA_H__
// Converts a flat logical element index into a memory offset (in elements),
// walking the dimensions from innermost to outermost using the given shape
// and per-dimension strides. Usable from both host and device code.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}
// First pass of a full logical-AND reduction: each block ANDs up to
// BLOCK_SIZE input elements via a shared-memory tree reduction and writes
// one boolean per block into temp_output.
// NOTE(review): the halving loop assumes blockDim.x equals BLOCK_SIZE and is
// a power of two — confirm this holds at every launch site.
template <size_t BLOCK_SIZE, typename Tdata>
__global__ void allReduceTempKernel(
    bool *temp_output,
    const Tdata *input,
    size_t input_size,
    size_t permuted_input_shape_size,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides) {
    __shared__ bool s_data[BLOCK_SIZE];
    size_t tid = threadIdx.x;
    size_t idx = tid + blockIdx.x * blockDim.x;
    if (idx < input_size) {
        size_t input_offset = indexToOffset(idx, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
        s_data[tid] = static_cast<bool>(input[input_offset]);
    } else {
        // Out-of-range threads contribute `true`, the identity of AND.
        s_data[tid] = true;
    }
    __syncthreads();
    // Tree reduction in shared memory: halve the active range each step.
    for (size_t s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            s_data[tid] = s_data[tid] && s_data[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) {
        temp_output[blockIdx.x] = s_data[0];
    }
}
// Second pass of the full reduction: a single block ANDs all per-block
// partial results from the first pass into one boolean. Each thread loops
// over the partials with a stride of blockDim.x (so num_blocks may exceed
// BLOCK_SIZE), then a shared-memory tree reduction combines the per-thread
// values.
template <size_t BLOCK_SIZE>
__global__ void finalAllReduceKernel(
    bool *output,
    const bool *block_results,
    size_t num_blocks) {
    __shared__ bool s_data[BLOCK_SIZE];
    size_t tid = threadIdx.x;
    bool thread_val = true;
    for (size_t i = tid; i < num_blocks; i += blockDim.x) {
        thread_val = thread_val && block_results[i];
    }
    s_data[tid] = thread_val;
    __syncthreads();
    for (size_t s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            s_data[tid] = s_data[tid] && s_data[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) {
        *output = s_data[0];
    }
}
// Partial reduction (some axes kept): one thread per output element. Thread
// `idx` ANDs the `reduce_num` logically-consecutive input elements that map
// onto its output element — the permuted shape/strides place the reduced
// axes last, so those elements are consecutive in the flat logical index.
// BLOCK_SIZE is unused in the body; it is kept for launch-site symmetry with
// the other kernels.
template <size_t BLOCK_SIZE, typename Tdata>
__global__ void allKernel(
    bool *output,
    const Tdata *input,
    size_t permuted_input_shape_size,
    size_t output_shape_size,
    size_t output_size,
    size_t reduce_num,
    size_t *permuted_input_shape,
    size_t *output_shape,
    ptrdiff_t *permuted_input_strides,
    ptrdiff_t *output_strides) {
    size_t tid = threadIdx.x;
    size_t idx = tid + blockIdx.x * blockDim.x;
    if (idx >= output_size) {
        return;
    }
    size_t output_index = indexToOffset(idx, output_shape_size, output_shape, output_strides);
    bool tempRes = true;
    for (size_t i = 0; i < reduce_num; i++) {
        size_t input_offset = indexToOffset(i + idx * reduce_num, permuted_input_shape_size, permuted_input_shape, permuted_input_strides);
        tempRes = tempRes && static_cast<bool>(input[input_offset]);
    }
    output[output_index] = tempRes;
}
#endif // __ALL_CUDA_H__
#ifndef __ALL_INFO_H__
#define __ALL_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <algorithm>
#include <cstddef>
#include <vector>
namespace op::all {

// Shape/stride metadata for the "all" reduction. The input axes are permuted
// so that the kept axes come first and the reduced axes last; kernels then
// treat each run of `reduce_num` flat indices as one output element.
class AllInfo {
    AllInfo() = default;

public:
    infiniDtype_t dtype;
    std::vector<size_t> permuted_input_shape;      // kept axes first, reduced axes last
    std::vector<size_t> output_shape;
    std::vector<ptrdiff_t> permuted_input_strides; // permuted together with the shape
    std::vector<ptrdiff_t> output_strides;
    size_t reduce_dim_size; // how many axes are reduced
    size_t reduce_num;      // input elements folded into each output element
    size_t input_size;      // total number of input elements
    size_t output_size;     // total number of output elements
    size_t ndim;            // rank of the input

    // Builds the metadata from the operator's tensor descriptors.
    // `keepdim` is accepted for interface symmetry but not needed here: the
    // output descriptor already carries the final shape.
    static utils::Result<AllInfo> create(
        infiniopTensorDescriptor_t output_desc,
        infiniopTensorDescriptor_t input_desc,
        size_t *dim,
        size_t dim_size,
        bool keepdim) {
        const auto in_shape = input_desc->shape();
        const auto in_strides = input_desc->strides();
        const size_t rank = input_desc->ndim();

        // Count how many input elements collapse into each output element.
        size_t elems_per_output = 1;
        for (size_t i = 0; i < dim_size; ++i) {
            elems_per_output *= in_shape[dim[i]];
        }

        // Permutation order: every non-reduced axis first, then the reduced ones.
        std::vector<size_t> order;
        order.reserve(rank);
        for (size_t axis = 0; axis < rank; ++axis) {
            const bool is_reduced = std::find(dim, dim + dim_size, axis) != dim + dim_size;
            if (!is_reduced) {
                order.push_back(axis);
            }
        }
        order.insert(order.end(), dim, dim + dim_size);

        // Apply the permutation to shape and strides in lockstep.
        std::vector<size_t> perm_shape;
        std::vector<ptrdiff_t> perm_strides;
        perm_shape.reserve(order.size());
        perm_strides.reserve(order.size());
        for (size_t axis : order) {
            perm_shape.push_back(in_shape[axis]);
            perm_strides.push_back(in_strides[axis]);
        }

        return utils::Result<AllInfo>(AllInfo{input_desc->dtype(),
                                              perm_shape,
                                              output_desc->shape(),
                                              perm_strides,
                                              output_desc->strides(),
                                              dim_size,
                                              elems_per_output,
                                              input_desc->numel(),
                                              output_desc->numel(),
                                              rank});
    }
};

} // namespace op::all
#endif
#ifndef __ALL_METAX_H__
#define __ALL_METAX_H__
#include "../all_desc.h"
// Instantiates op::all::metax::Descriptor from the shared DESCRIPTOR macro.
DESCRIPTOR(metax);
#endif
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "all_metax.h"
namespace op::all::metax {

struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

// Builds a METAX descriptor. The workspace holds device copies of the
// permuted input shape/strides and the output shape/strides.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool keepdim) {
    auto result = AllInfo::create(output_desc, input_desc, dim, dim_size, keepdim);
    CHECK_RESULT(result);
    auto info = result.take();
    size_t workspace_size = 0;
    // One size_t (shape) and one ptrdiff_t (stride) per input and output dim.
    workspace_size += (input_desc->ndim() + output_desc->ndim()) * (sizeof(size_t) + sizeof(ptrdiff_t));
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        info, workspace_size, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

namespace {

// Copies the shape/stride metadata into the workspace and launches the
// appropriate kernel(s) on `stream`.
template <size_t BLOCK_SIZE, typename Tdata>
infiniStatus_t launchKernel(
    const AllInfo &info,
    bool *output, const Tdata *input,
    hcStream_t stream, void *workspace, size_t workspace_size) {
    size_t input_ndim = info.permuted_input_shape.size();
    size_t output_ndim = info.output_shape.size();
    size_t input_size = info.input_size;
    size_t output_size = info.output_size;
    size_t reduce_num = info.reduce_num;

    // Carve the workspace into the four metadata arrays (layout must match
    // the size computed in Descriptor::create).
    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
    size_t workspace_offset = 0;
    size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
    size_t *output_shape_hc = permuted_input_shape_hc + input_ndim;
    workspace_offset += (input_ndim + output_ndim) * sizeof(size_t);
    ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
    ptrdiff_t *output_strides_hc = permuted_input_strides_hc + input_ndim;
    workspace_offset += (input_ndim + output_ndim) * sizeof(ptrdiff_t);

    CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
    CHECK_METAX(hcMemcpyAsync(output_shape_hc, info.output_shape.data(), output_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
    CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
    CHECK_METAX(hcMemcpyAsync(output_strides_hc, info.output_strides.data(), output_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));

    if (info.reduce_num == input_size) {
        // Full reduction: two-pass block reduction into a single boolean.
        size_t grid_size = (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
        bool *temp_output;
        CHECK_METAX(hcMalloc(&temp_output, grid_size * sizeof(bool)));
        allReduceTempKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, BLOCK_SIZE * sizeof(bool), stream>>>(
            temp_output, input, input_size, input_ndim, permuted_input_shape_hc, permuted_input_strides_hc);
        // BUGFIX: the final pass must run on the same stream as the first —
        // it previously launched on the default stream with no ordering
        // guarantee relative to allReduceTempKernel.
        finalAllReduceKernel<BLOCK_SIZE><<<1, BLOCK_SIZE, 0, stream>>>(output, temp_output, grid_size);
        // BUGFIX: temp_output may still be in use by the asynchronous
        // kernels; synchronize the stream before freeing it.
        CHECK_METAX(hcStreamSynchronize(stream));
        CHECK_METAX(hcFree(temp_output));
    } else {
        // Partial reduction: one thread per output element.
        size_t grid_size = (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
        allKernel<BLOCK_SIZE, Tdata><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            output, input, input_ndim, output_ndim, output_size, reduce_num,
            permuted_input_shape_hc, output_shape_hc, permuted_input_strides_hc, output_strides_hc);
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace

// Dispatches on dtype/block size and launches the reduction on `stream_`.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    const void *input,
    size_t *dim,
    size_t dim_size,
    bool keepdim,
    void *stream_) const {
    hcStream_t stream = (hcStream_t)stream_;

#define CALCULATE_ALL(BLOCK_SIZE, Tdata)          \
    launchKernel<BLOCK_SIZE, Tdata>(              \
        _info,                                    \
        (bool *)output, (const Tdata *)input,     \
        stream, workspace, workspace_size)

#define CALCULATE_ALL_WITH_BLOCK_SIZE(BLOCK_SIZE)          \
    {                                                      \
        if (_info.dtype == INFINI_DTYPE_BOOL)              \
            return CALCULATE_ALL(BLOCK_SIZE, bool);        \
        else if (_info.dtype == INFINI_DTYPE_U8)           \
            return CALCULATE_ALL(BLOCK_SIZE, uint8_t);     \
        else                                               \
            return INFINI_STATUS_BAD_TENSOR_DTYPE;         \
    }

    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
        CALCULATE_ALL_WITH_BLOCK_SIZE(256)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }

#undef CALCULATE_ALL_WITH_BLOCK_SIZE
#undef CALCULATE_ALL
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::all::metax
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment