Commit 45a3794b authored by wooway777

issue/1031 T1-1-17

parent cb7f0b7d
@@ -17,12 +17,12 @@ def run_tests(args):
"causal_softmax.py",
"clip.py",
"conv.py",
#"dequantize_awq.py",
# "dequantize_awq.py",
"gelu.py",
"gemm.py",
#"layer_norm.py",
# "layer_norm.py",
"logsoftmax.py",
#"lp_norm.py",
# "lp_norm.py",
"mul.py",
"ones.py",
"random_sample.py",
@@ -31,7 +31,7 @@ def run_tests(args):
"rms_norm.py",
"rope.py",
"sigmoid.py",
#"softmax.py",
# "softmax.py",
"softplus.py",
"sub.py",
"swiglu.py",
@@ -42,6 +42,7 @@ def run_tests(args):
# "paged_attention.py",
# "paged_caching.py",
# "paged_attention_prefill.py"
"cross_entropy.py",
]:
result = subprocess.run(
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
#include "infinicore/ops/avg_pool1d.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<AvgPool1d::schema> &AvgPool1d::dispatcher() {
static common::OpDispatcher<AvgPool1d::schema> dispatcher_;
return dispatcher_;
}
void AvgPool1d::execute(
Tensor output,
Tensor input,
size_t kernel_size,
size_t stride,
size_t padding) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
if (stride == 0) {
stride = kernel_size;
}
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error(
"No AvgPool1d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input, kernel_size, stride, padding);
}
Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride, size_t padding) {
if (stride == 0) {
stride = kernel_size;
}
const auto &shape = input->shape();
if (shape.size() != 3) {
throw std::runtime_error("AvgPool1d expects tensors with shape [N, C, L]");
}
const size_t n = shape[0];
const size_t c = shape[1];
const size_t l_in = shape[2];
if (l_in + 2 * padding < kernel_size) {
throw std::runtime_error("AvgPool1d kernel_size is larger than padded length");
}
const size_t out_width = (l_in + 2 * padding - kernel_size) / stride + 1;
Shape out_shape = {n, c, out_width};
auto output = Tensor::empty(out_shape, input->dtype(), input->device());
avg_pool1d_(output, input, kernel_size, stride, padding);
return output;
}
void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding) {
AvgPool1d::execute(output, input, kernel_size, stride, padding);
}
} // namespace infinicore::op
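For reference, the out_width computed above is the standard pooling arithmetic, with stride == 0 falling back to kernel_size. A minimal Python sketch of the same shape calculation (helper name is hypothetical, not part of the commit):

def avg_pool1d_out_width(l_in: int, kernel_size: int, stride: int = 0, padding: int = 0) -> int:
    # stride == 0 defaults to kernel_size, mirroring the C++ frontend above
    if stride == 0:
        stride = kernel_size
    padded = l_in + 2 * padding
    if padded < kernel_size:
        raise ValueError("kernel_size is larger than padded length")
    return (padded - kernel_size) // stride + 1

# e.g. l_in=10, kernel_size=3, stride=2, padding=1 -> (12 - 3) // 2 + 1 = 5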
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/avg_pool1d.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::avg_pool1d_impl::infiniop {
thread_local common::OpCache<size_t, infiniopAvgPool1dDescriptor_t> caches(
100,
[](infiniopAvgPool1dDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyAvgPool1dDescriptor(desc));
desc = nullptr;
}
});
void calculate(
Tensor output,
Tensor input,
size_t kernel_size,
size_t stride,
size_t padding) {
if (stride == 0) {
stride = kernel_size;
}
size_t seed = hash_combine(output, input, kernel_size, stride, padding);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopAvgPool1dDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateAvgPool1dDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc(),
kernel_size,
stride,
padding));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size));
// Skip allocation when the backend needs no workspace, matching the other ops in this commit.
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopAvgPool1d(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
context::getStream()));
}
static bool registered = []() {
AvgPool1d::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::avg_pool1d_impl::infiniop
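The thread_local OpCache above memoizes descriptors per device, keyed by a hash of tensor metadata plus (kernel_size, stride, padding), so repeated launches with identical shapes skip descriptor creation. A rough Python analogue of the pattern (names hypothetical):

from functools import lru_cache

@lru_cache(maxsize=100)  # mirrors the 100-entry cache above
def make_descriptor(seed: int) -> str:
    # stand-in for infiniopCreateAvgPool1dDescriptor; the real code builds a
    # backend descriptor and the eviction callback destroys it
    return f"desc-{seed}"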
#include "infinicore/ops/cross_entropy.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<CrossEntropy::schema> &CrossEntropy::dispatcher() {
static common::OpDispatcher<CrossEntropy::schema> dispatcher_;
return dispatcher_;
}
void CrossEntropy::execute(Tensor output, Tensor input, Tensor target) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(input, target);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No CrossEntropy implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input, target);
}
Tensor cross_entropy(Tensor input, Tensor target) {
Shape shape = target->shape();
auto output = Tensor::empty(shape, input->dtype(), input->device());
cross_entropy_(output, input, target);
return output;
}
void cross_entropy_(Tensor output, Tensor input, Tensor target) {
CrossEntropy::execute(output, input, target);
}
} // namespace infinicore::op
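Note that the loss tensor is allocated with the target's shape, i.e. one loss value per token with no reduction. Assuming the op mirrors PyTorch's unreduced cross entropy, the reference behaviour would be (a sketch, not part of the commit):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 32000)          # [num_tokens, vocab_size]
target = torch.randint(0, 32000, (4,))  # [num_tokens]
loss = F.cross_entropy(logits, target, reduction="none")  # shape [4], same as target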
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/cross_entropy.hpp"
#include <infiniop.h>
namespace infinicore::op::cross_entropy_impl::infiniop {
thread_local common::OpCache<size_t, infiniopCrossEntropyDescriptor_t> caches(
100,
[](infiniopCrossEntropyDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyCrossEntropyDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor output, Tensor input, Tensor target) {
size_t seed = hash_combine(output, input, target);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopCrossEntropyDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateCrossEntropyDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc(),
target->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size));
// Skip allocation when the backend needs no workspace, matching the other ops in this commit.
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopCrossEntropy(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
target->data(),
context::getStream()));
}
static bool registered = []() {
CrossEntropy::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::cross_entropy_impl::infiniop
#include "infinicore/ops/equal.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<Equal::schema> &Equal::dispatcher() {
static common::OpDispatcher<Equal::schema> dispatcher_;
return dispatcher_;
}
void Equal::execute(Tensor out, Tensor a, Tensor b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b);
infinicore::context::setDevice(out->device());
auto device_type = out->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No Equal implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(out, a, b);
}
Tensor equal(Tensor a, Tensor b) {
auto out = Tensor::empty(a->shape(), DataType::BOOL, a->device());
equal_(out, a, b);
return out;
}
void equal_(Tensor out, Tensor a, Tensor b) {
if (out->dtype() != DataType::BOOL) {
throw std::runtime_error("Equal expects bool output tensor.");
}
Equal::execute(out, a, b);
}
} // namespace infinicore::op
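The out-of-place equal always allocates a BOOL output, and the in-place variant rejects any other dtype; the elementwise semantics match NumPy's np.equal:

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([1.0, 0.0, 3.0])
out = np.equal(a, b)  # dtype=bool, same shape as the inputs -> [True, False, True]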
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/equal.hpp"
#include <infiniop.h>
namespace infinicore::op::equal_impl::infiniop {
thread_local common::OpCache<size_t, infiniopEqualDescriptor_t> caches(
100,
[](infiniopEqualDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyEqualDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor out, Tensor a, Tensor b) {
size_t seed = hash_combine(out, a, b);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
infiniopEqualDescriptor_t desc = nullptr;
if (auto cached = cache.get(seed)) {
desc = *cached;
} else {
INFINICORE_CHECK_ERROR(infiniopCreateEqualDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetEqualWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopEqual(
desc,
workspace_ptr,
workspace_size,
out->data(),
a->data(),
b->data(),
context::getStream()));
}
static bool registered = []() {
Equal::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::equal_impl::infiniop
#include "infinicore/ops/hardswish.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<Hardswish::schema> &Hardswish::dispatcher() {
static common::OpDispatcher<Hardswish::schema> dispatcher_;
return dispatcher_;
}
void Hardswish::execute(Tensor output, Tensor input) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error(
"No Hardswish implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input);
}
Tensor hardswish(Tensor input) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
hardswish_(output, input);
return output;
}
void hardswish_(Tensor output, Tensor input) {
Hardswish::execute(output, input);
}
} // namespace infinicore::op
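Hardswish is the usual x * relu6(x + 3) / 6; a NumPy reference for sanity-checking the kernel (sketch only):

import numpy as np

def hardswish_ref(x: np.ndarray) -> np.ndarray:
    # hardswish(x) = x * clip(x + 3, 0, 6) / 6
    return x * np.clip(x + 3.0, 0.0, 6.0) / 6.0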
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/hardswish.hpp"
#include <infiniop.h>
namespace infinicore::op::hardswish_impl::infiniop {
thread_local common::OpCache<size_t, infiniopHardSwishDescriptor_t> caches(
100,
[](infiniopHardSwishDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyHardSwishDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor output, Tensor input) {
size_t seed = hash_combine(output, input);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopHardSwishDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateHardSwishDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetHardSwishWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopHardSwish(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
context::getStream()));
}
static bool registered = []() {
Hardswish::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::hardswish_impl::infiniop
#include "infinicore/ops/hardtanh.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<HardTanh::schema> &HardTanh::dispatcher() {
static common::OpDispatcher<HardTanh::schema> dispatcher_;
return dispatcher_;
}
void HardTanh::execute(Tensor output, Tensor input, float min_val, float max_val) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error(
"No HardTanh implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input, min_val, max_val);
}
Tensor hardtanh(Tensor input, float min_val, float max_val) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
hardtanh_(output, input, min_val, max_val);
return output;
}
void hardtanh_(Tensor output, Tensor input, float min_val, float max_val) {
HardTanh::execute(output, input, min_val, max_val);
}
} // namespace infinicore::op
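HardTanh clamps to [min_val, max_val] (the bindings below default to -1.0 and 1.0); the NumPy equivalent is a one-liner:

import numpy as np

def hardtanh_ref(x: np.ndarray, min_val: float = -1.0, max_val: float = 1.0) -> np.ndarray:
    # hardtanh(x) = clamp(x, min_val, max_val)
    return np.clip(x, min_val, max_val)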
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/hardtanh.hpp"
#include <infiniop.h>
namespace infinicore::op::hardtanh_impl::infiniop {
thread_local common::OpCache<size_t, infiniopHardTanhDescriptor_t> caches(
100,
[](infiniopHardTanhDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyHardTanhDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor output, Tensor input, float min_val, float max_val) {
size_t seed = hash_combine(output, input, min_val, max_val);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopHardTanhDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateHardTanhDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc(),
min_val,
max_val));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetHardTanhWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopHardTanh(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
context::getStream()));
}
static bool registered = []() {
HardTanh::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::hardtanh_impl::infiniop
@@ -6,9 +6,14 @@
#include "ops/add_rms_norm.hpp"
#include "ops/all.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/embedding.hpp"
#include "ops/equal.hpp"
#include "ops/flash_attention.hpp"
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/kv_caching.hpp"
#include "ops/linear.hpp"
#include "ops/linear_w8a8i8.hpp"
@@ -45,12 +50,16 @@ inline void bind(py::module &m) {
bind_matmul(m);
bind_mul(m);
bind_mha_varlen(m);
bind_hardswish(m);
bind_hardtanh(m);
bind_paged_attention(m);
bind_paged_attention_prefill(m);
bind_paged_caching(m);
bind_random_sample(m);
bind_cross_entropy(m);
bind_rearrange(m);
bind_rms_norm(m);
bind_avg_pool1d(m);
bind_silu(m);
bind_swiglu(m);
bind_rope(m);
@@ -62,6 +71,7 @@ inline void bind(py::module &m) {
bind_var(m);
bind_topk(m);
bind_all(m);
bind_equal(m);
}
} // namespace infinicore::ops
#pragma once
#include <optional>
#include <pybind11/pybind11.h>
#include "infinicore/ops/avg_pool1d.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_avg_pool1d(py::module &m) {
m.def(
"avg_pool1d",
[](::infinicore::Tensor input, size_t kernel_size, std::optional<size_t> stride, size_t padding) {
return op::avg_pool1d(input, kernel_size, stride.value_or(0), padding);
},
py::arg("input"),
py::arg("kernel_size"),
py::arg("stride") = py::none(),
py::arg("padding") = 0,
R"doc(AvgPool1d out-of-place.)doc");
m.def(
"avg_pool1d_",
[](::infinicore::Tensor output, ::infinicore::Tensor input, size_t kernel_size, std::optional<size_t> stride, size_t padding) {
op::avg_pool1d_(output, input, kernel_size, stride.value_or(0), padding);
},
py::arg("output"),
py::arg("input"),
py::arg("kernel_size"),
py::arg("stride") = py::none(),
py::arg("padding") = 0,
R"doc(AvgPool1d in-place variant writing to provided output tensor.)doc");
}
} // namespace infinicore::ops
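From Python the binding would be called as below; the keyword signature follows the py::arg declarations above, while the module name is an assumption:

import infinicore  # module name is an assumption

# x: an existing [N, C, L] tensor
y = infinicore.avg_pool1d(x, kernel_size=3)                       # stride=None -> stride = kernel_size
y = infinicore.avg_pool1d(x, kernel_size=3, stride=2, padding=1)
infinicore.avg_pool1d_(y, x, kernel_size=3, stride=2, padding=1)  # writes into a preallocated output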
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/cross_entropy.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_cross_entropy(py::module &m) {
m.def("cross_entropy",
&op::cross_entropy,
py::arg("logits"),
py::arg("target"),
R"doc(Token-wise cross entropy loss without reduction.)doc");
m.def("cross_entropy_",
&op::cross_entropy_,
py::arg("loss"),
py::arg("logits"),
py::arg("target"),
R"doc(Write cross entropy loss into a provided tensor.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/equal.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_equal(py::module &m) {
m.def("equal",
&op::equal,
py::arg("a"),
py::arg("b"),
R"doc(Elementwise equality returning a bool tensor.)doc");
m.def("equal_",
&op::equal_,
py::arg("out"),
py::arg("a"),
py::arg("b"),
R"doc(In-place elementwise equality writing into `out`.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardswish.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardswish(py::module &m) {
m.def("hardswish",
&op::hardswish,
py::arg("input"),
R"doc(Out-of-place Hardswish activation.)doc");
m.def("hardswish_",
&op::hardswish_,
py::arg("output"),
py::arg("input"),
R"doc(In-place Hardswish activation.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardtanh.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardtanh(py::module &m) {
m.def("hardtanh",
&op::hardtanh,
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(Apply the HardTanh activation.)doc");
m.def("hardtanh_",
&op::hardtanh_,
py::arg("output"),
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(In-place HardTanh activation.)doc");
}
} // namespace infinicore::ops
#ifndef __AVG_POOL1D_H__
#define __AVG_POOL1D_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "infiniop/ops/avg_pool1d.h"
#define DESCRIPTOR(NAMESPACE) \
namespace op::avg_pool1d::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AvgPool1dInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
AvgPool1dInfo info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
size_t kernel_size, \
size_t stride, \
size_t padding); \
\
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
class AvgPool1dInfo {
private:
AvgPool1dInfo() = default;
public:
infiniDtype_t dtype;
size_t batch, channels, in_width, out_width;
size_t kernel_size, stride, padding;
ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width;
ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width;
static utils::Result<AvgPool1dInfo> createAvgPool1dInfo(
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t kernel_size,
size_t stride,
size_t padding) {
CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER);
const infiniDtype_t dtype = y_desc->dtype();
CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t batch = x_desc->dim(0);
size_t channels = x_desc->dim(1);
size_t in_width = x_desc->dim(2);
CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(y_desc->dim(1) == channels, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t padded_len = in_width + 2 * padding;
CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t out_width = (padded_len - kernel_size) / stride + 1;
CHECK_OR_RETURN(y_desc->dim(2) == out_width, INFINI_STATUS_BAD_TENSOR_SHAPE);
return utils::Result<AvgPool1dInfo>(AvgPool1dInfo{
dtype,
batch, channels, in_width, out_width,
kernel_size, stride, padding,
y_desc->stride(0), y_desc->stride(1), y_desc->stride(2),
x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)});
}
};
#endif
#include "avg_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include <algorithm>
namespace op::avg_pool1d::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t kernel_size,
size_t stride,
size_t padding) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
info.take(),
0,
nullptr,
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename T>
infiniStatus_t calculateAvgPool1d(const AvgPool1dInfo &info,
T *y,
const T *x) {
const float inv_kernel = 1.0f / static_cast<float>(info.kernel_size);
#pragma omp parallel for
for (ptrdiff_t bc = 0; bc < ptrdiff_t(info.batch * info.channels); ++bc) {
const ptrdiff_t b = bc / ptrdiff_t(info.channels);
const ptrdiff_t c = bc % ptrdiff_t(info.channels);
// Offsets stay signed so negative strides cannot wrap through size_t.
const ptrdiff_t y_base = b * info.y_stride_batch + c * info.y_stride_channel;
const ptrdiff_t x_base = b * info.x_stride_batch + c * info.x_stride_channel;
for (size_t ow = 0; ow < info.out_width; ++ow) {
const ptrdiff_t y_offset = y_base + ptrdiff_t(ow) * info.y_stride_width;
// Cast padding/kernel_size explicitly so the window bounds stay signed.
const long long start_w = static_cast<long long>(ow * info.stride) - static_cast<long long>(info.padding);
const long long end_w = start_w + static_cast<long long>(info.kernel_size);
const long long valid_start = std::max(0LL, start_w);
const long long valid_end = std::min(static_cast<long long>(info.in_width), end_w);
float sum = 0.0f;
for (long long iw = valid_start; iw < valid_end; ++iw) {
const ptrdiff_t x_offset = x_base + ptrdiff_t(iw) * info.x_stride_width;
sum += utils::cast<float>(x[x_offset]);
}
const float avg = sum * inv_kernel;
y[y_offset] = utils::cast<T>(avg);
}
}
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE(TDATA) calculateAvgPool1d(_info, (TDATA *)y, (const TDATA *)x)
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) const {
switch (_info.dtype) {
case INFINI_DTYPE_F16:
return CALCULATE(fp16_t);
case INFINI_DTYPE_BF16:
return CALCULATE(bf16_t);
case INFINI_DTYPE_F32:
return CALCULATE(float);
case INFINI_DTYPE_F64:
return CALCULATE(double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
#undef CALCULATE
} // namespace op::avg_pool1d::cpu
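One semantic worth noting: the kernel sums only in-bounds elements (zero padding contributes nothing) but always divides by kernel_size, i.e. PyTorch's count_include_pad=True behaviour. A NumPy reference to test against, under that assumption:

import numpy as np

def avg_pool1d_ref(x: np.ndarray, kernel_size: int, stride: int, padding: int) -> np.ndarray:
    # x: [N, C, L]; zero-pad, then average with a fixed divisor of kernel_size,
    # matching the CPU kernel above
    xp = np.pad(x, ((0, 0), (0, 0), (padding, padding)))
    out_w = (xp.shape[-1] - kernel_size) // stride + 1
    out = np.empty(x.shape[:2] + (out_w,), dtype=x.dtype)
    for ow in range(out_w):
        s = ow * stride
        out[..., ow] = xp[..., s:s + kernel_size].sum(axis=-1) / kernel_size
    return out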
#ifndef __INFINIOP_AVG_POOL1D_CPU_H__
#define __INFINIOP_AVG_POOL1D_CPU_H__
#include "../avg_pool1d.h"
DESCRIPTOR(cpu)
#endif